1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 #include <asm/msr.h> 10 11 #include "x86.h" 12 #include "cpuid.h" 13 #include "hyperv.h" 14 #include "mmu.h" 15 #include "nested.h" 16 #include "pmu.h" 17 #include "posted_intr.h" 18 #include "sgx.h" 19 #include "trace.h" 20 #include "vmx.h" 21 #include "smm.h" 22 #include "x86_ops.h" 23 24 static bool __read_mostly enable_shadow_vmcs = 1; 25 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 26 27 static bool __ro_after_init warn_on_missed_cc; 28 module_param(warn_on_missed_cc, bool, 0444); 29 30 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 31 32 /* 33 * Hyper-V requires all of these, so mark them as supported even though 34 * they are just treated the same as all-context. 35 */ 36 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 37 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 39 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 40 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 41 42 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 43 44 enum { 45 VMX_VMREAD_BITMAP, 46 VMX_VMWRITE_BITMAP, 47 VMX_BITMAP_NR 48 }; 49 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 50 51 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 52 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 53 54 struct shadow_vmcs_field { 55 u16 encoding; 56 u16 offset; 57 }; 58 static struct shadow_vmcs_field shadow_read_only_fields[] = { 59 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 60 #include "vmcs_shadow_fields.h" 61 }; 62 static int max_shadow_read_only_fields = 63 ARRAY_SIZE(shadow_read_only_fields); 64 65 static struct shadow_vmcs_field shadow_read_write_fields[] = { 66 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 67 #include "vmcs_shadow_fields.h" 68 }; 69 static int 
max_shadow_read_write_fields = 70 ARRAY_SIZE(shadow_read_write_fields); 71 72 static void init_vmcs_shadow_fields(void) 73 { 74 int i, j; 75 76 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 77 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 78 79 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 80 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 81 u16 field = entry.encoding; 82 83 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 84 (i + 1 == max_shadow_read_only_fields || 85 shadow_read_only_fields[i + 1].encoding != field + 1)) 86 pr_err("Missing field from shadow_read_only_field %x\n", 87 field + 1); 88 89 if (get_vmcs12_field_offset(field) < 0) 90 continue; 91 92 clear_bit(field, vmx_vmread_bitmap); 93 if (field & 1) 94 #ifdef CONFIG_X86_64 95 continue; 96 #else 97 entry.offset += sizeof(u32); 98 #endif 99 shadow_read_only_fields[j++] = entry; 100 } 101 max_shadow_read_only_fields = j; 102 103 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 104 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 105 u16 field = entry.encoding; 106 107 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 108 (i + 1 == max_shadow_read_write_fields || 109 shadow_read_write_fields[i + 1].encoding != field + 1)) 110 pr_err("Missing field from shadow_read_write_field %x\n", 111 field + 1); 112 113 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 114 field <= GUEST_TR_AR_BYTES, 115 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 116 117 if (get_vmcs12_field_offset(field) < 0) 118 continue; 119 120 /* 121 * KVM emulates PML and the VMX preemption timer irrespective 122 * of hardware support, but shadowing their related VMCS fields 123 * requires hardware support as the CPU will reject VMWRITEs to 124 * fields that don't exist. 
125 */ 126 switch (field) { 127 case GUEST_PML_INDEX: 128 if (!cpu_has_vmx_pml()) 129 continue; 130 break; 131 case VMX_PREEMPTION_TIMER_VALUE: 132 if (!cpu_has_vmx_preemption_timer()) 133 continue; 134 break; 135 default: 136 break; 137 } 138 139 clear_bit(field, vmx_vmwrite_bitmap); 140 clear_bit(field, vmx_vmread_bitmap); 141 if (field & 1) 142 #ifdef CONFIG_X86_64 143 continue; 144 #else 145 entry.offset += sizeof(u32); 146 #endif 147 shadow_read_write_fields[j++] = entry; 148 } 149 max_shadow_read_write_fields = j; 150 } 151 152 /* 153 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 154 * set the success or error code of an emulated VMX instruction (as specified 155 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 156 * instruction. 157 */ 158 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 162 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 163 return kvm_skip_emulated_instruction(vcpu); 164 } 165 166 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 167 { 168 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 169 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 170 X86_EFLAGS_SF | X86_EFLAGS_OF)) 171 | X86_EFLAGS_CF); 172 return kvm_skip_emulated_instruction(vcpu); 173 } 174 175 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 176 u32 vm_instruction_error) 177 { 178 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 179 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 180 X86_EFLAGS_SF | X86_EFLAGS_OF)) 181 | X86_EFLAGS_ZF); 182 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 183 /* 184 * We don't need to force sync to shadow VMCS because 185 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 186 * fields and thus must be synced. 
187 */ 188 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 189 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 190 191 return kvm_skip_emulated_instruction(vcpu); 192 } 193 194 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 195 { 196 struct vcpu_vmx *vmx = to_vmx(vcpu); 197 198 /* 199 * failValid writes the error number to the current VMCS, which 200 * can't be done if there isn't a current VMCS. 201 */ 202 if (vmx->nested.current_vmptr == INVALID_GPA && 203 !nested_vmx_is_evmptr12_valid(vmx)) 204 return nested_vmx_failInvalid(vcpu); 205 206 return nested_vmx_failValid(vcpu, vm_instruction_error); 207 } 208 209 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 210 { 211 /* TODO: not to reset guest simply here. */ 212 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 213 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 214 } 215 216 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 217 { 218 return fixed_bits_valid(control, low, high); 219 } 220 221 static inline u64 vmx_control_msr(u32 low, u32 high) 222 { 223 return low | ((u64)high << 32); 224 } 225 226 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 227 { 228 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 229 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 230 vmx->nested.need_vmcs12_to_shadow_sync = false; 231 } 232 233 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 234 { 235 #ifdef CONFIG_KVM_HYPERV 236 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 237 struct vcpu_vmx *vmx = to_vmx(vcpu); 238 239 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); 240 vmx->nested.hv_evmcs = NULL; 241 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 242 243 if (hv_vcpu) { 244 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 245 hv_vcpu->nested.vm_id = 0; 246 hv_vcpu->nested.vp_id = 0; 247 } 248 #endif 249 } 250 251 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 252 { 253 #ifdef 
CONFIG_KVM_HYPERV 254 struct vcpu_vmx *vmx = to_vmx(vcpu); 255 /* 256 * When Enlightened VMEntry is enabled on the calling CPU we treat 257 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 258 * way to distinguish it from VMCS12) and we must not corrupt it by 259 * writing to the non-existent 'launch_state' field. The area doesn't 260 * have to be the currently active EVMCS on the calling CPU and there's 261 * nothing KVM has to do to transition it from 'active' to 'non-active' 262 * state. It is possible that the area will stay mapped as 263 * vmx->nested.hv_evmcs but this shouldn't be a problem. 264 */ 265 if (!guest_cpu_cap_has_evmcs(vcpu) || 266 !evmptr_is_valid(nested_get_evmptr(vcpu))) 267 return false; 268 269 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 270 nested_release_evmcs(vcpu); 271 272 return true; 273 #else 274 return false; 275 #endif 276 } 277 278 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 279 struct loaded_vmcs *prev) 280 { 281 struct vmcs_host_state *dest, *src; 282 283 if (unlikely(!vmx->vt.guest_state_loaded)) 284 return; 285 286 src = &prev->host_state; 287 dest = &vmx->loaded_vmcs->host_state; 288 289 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 290 dest->ldt_sel = src->ldt_sel; 291 #ifdef CONFIG_X86_64 292 dest->ds_sel = src->ds_sel; 293 dest->es_sel = src->es_sel; 294 #endif 295 } 296 297 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 298 { 299 struct vcpu_vmx *vmx = to_vmx(vcpu); 300 struct loaded_vmcs *prev; 301 int cpu; 302 303 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 304 return; 305 306 cpu = get_cpu(); 307 prev = vmx->loaded_vmcs; 308 vmx->loaded_vmcs = vmcs; 309 vmx_vcpu_load_vmcs(vcpu, cpu); 310 vmx_sync_vmcs_host_state(vmx, prev); 311 put_cpu(); 312 313 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 314 315 /* 316 * All lazily updated registers will be reloaded from VMCS12 on both 317 * vmentry and 
vmexit. 318 */ 319 vcpu->arch.regs_dirty = 0; 320 } 321 322 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) 323 { 324 struct vcpu_vmx *vmx = to_vmx(vcpu); 325 326 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); 327 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); 329 vmx->nested.pi_desc = NULL; 330 } 331 332 /* 333 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 334 * just stops using VMX. 335 */ 336 static void free_nested(struct kvm_vcpu *vcpu) 337 { 338 struct vcpu_vmx *vmx = to_vmx(vcpu); 339 340 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 341 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 342 343 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 344 return; 345 346 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 347 348 vmx->nested.vmxon = false; 349 vmx->nested.smm.vmxon = false; 350 vmx->nested.vmxon_ptr = INVALID_GPA; 351 free_vpid(vmx->nested.vpid02); 352 vmx->nested.posted_intr_nv = -1; 353 vmx->nested.current_vmptr = INVALID_GPA; 354 if (enable_shadow_vmcs) { 355 vmx_disable_shadow_vmcs(vmx); 356 vmcs_clear(vmx->vmcs01.shadow_vmcs); 357 free_vmcs(vmx->vmcs01.shadow_vmcs); 358 vmx->vmcs01.shadow_vmcs = NULL; 359 } 360 kfree(vmx->nested.cached_vmcs12); 361 vmx->nested.cached_vmcs12 = NULL; 362 kfree(vmx->nested.cached_shadow_vmcs12); 363 vmx->nested.cached_shadow_vmcs12 = NULL; 364 365 nested_put_vmcs12_pages(vcpu); 366 367 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 368 369 nested_release_evmcs(vcpu); 370 371 free_loaded_vmcs(&vmx->nested.vmcs02); 372 } 373 374 /* 375 * Ensure that the current vmcs of the logical processor is the 376 * vmcs01 of the vcpu before calling free_nested(). 
377 */ 378 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 379 { 380 vcpu_load(vcpu); 381 vmx_leave_nested(vcpu); 382 vcpu_put(vcpu); 383 } 384 385 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 386 387 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 388 { 389 return VALID_PAGE(root_hpa) && 390 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 391 } 392 393 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 394 gpa_t addr) 395 { 396 unsigned long roots = 0; 397 uint i; 398 struct kvm_mmu_root_info *cached_root; 399 400 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 401 402 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 403 cached_root = &vcpu->arch.mmu->prev_roots[i]; 404 405 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 406 eptp)) 407 roots |= KVM_MMU_ROOT_PREVIOUS(i); 408 } 409 if (roots) 410 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 411 } 412 413 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 414 struct x86_exception *fault) 415 { 416 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 417 struct vcpu_vmx *vmx = to_vmx(vcpu); 418 unsigned long exit_qualification; 419 u32 vm_exit_reason; 420 421 if (vmx->nested.pml_full) { 422 vm_exit_reason = EXIT_REASON_PML_FULL; 423 vmx->nested.pml_full = false; 424 425 /* 426 * It should be impossible to trigger a nested PML Full VM-Exit 427 * for anything other than an EPT Violation from L2. KVM *can* 428 * trigger nEPT page fault injection in response to an EPT 429 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 430 * tables also changed, but KVM should not treat EPT Misconfig 431 * VM-Exits as writes. 432 */ 433 WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 434 435 /* 436 * PML Full and EPT Violation VM-Exits both use bit 12 to report 437 * "NMI unblocking due to IRET", i.e. the bit can be propagated 438 * as-is from the original EXIT_QUALIFICATION. 
439 */ 440 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 441 } else { 442 if (fault->error_code & PFERR_RSVD_MASK) { 443 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 444 exit_qualification = 0; 445 } else { 446 exit_qualification = fault->exit_qualification; 447 exit_qualification |= vmx_get_exit_qual(vcpu) & 448 (EPT_VIOLATION_GVA_IS_VALID | 449 EPT_VIOLATION_GVA_TRANSLATED); 450 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 451 } 452 453 /* 454 * Although the caller (kvm_inject_emulated_page_fault) would 455 * have already synced the faulting address in the shadow EPT 456 * tables for the current EPTP12, we also need to sync it for 457 * any other cached EPTP02s based on the same EP4TA, since the 458 * TLB associates mappings to the EP4TA rather than the full EPTP. 459 */ 460 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 461 fault->address); 462 } 463 464 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 465 vmcs12->guest_physical_address = fault->address; 466 } 467 468 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 469 { 470 struct vcpu_vmx *vmx = to_vmx(vcpu); 471 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 472 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 473 474 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 475 nested_ept_ad_enabled(vcpu), 476 nested_ept_get_eptp(vcpu)); 477 } 478 479 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 480 { 481 WARN_ON(mmu_is_nested(vcpu)); 482 483 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 484 nested_ept_new_eptp(vcpu); 485 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 486 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 487 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 488 489 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 490 } 491 492 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 493 { 494 vcpu->arch.mmu = &vcpu->arch.root_mmu; 495 vcpu->arch.walk_mmu = 
&vcpu->arch.root_mmu; 496 } 497 498 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 499 u16 error_code) 500 { 501 bool inequality, bit; 502 503 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 504 inequality = 505 (error_code & vmcs12->page_fault_error_code_mask) != 506 vmcs12->page_fault_error_code_match; 507 return inequality ^ bit; 508 } 509 510 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 511 u32 error_code) 512 { 513 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 514 515 /* 516 * Drop bits 31:16 of the error code when performing the #PF mask+match 517 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 518 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 519 * error code. Including the to-be-dropped bits in the check might 520 * result in an "impossible" or missed exit from L1's perspective. 521 */ 522 if (vector == PF_VECTOR) 523 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 524 525 return (vmcs12->exception_bitmap & (1u << vector)); 526 } 527 528 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 529 struct vmcs12 *vmcs12) 530 { 531 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 532 return 0; 533 534 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 535 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 536 return -EINVAL; 537 538 return 0; 539 } 540 541 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 542 struct vmcs12 *vmcs12) 543 { 544 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 545 return 0; 546 547 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 548 return -EINVAL; 549 550 return 0; 551 } 552 553 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 554 struct vmcs12 *vmcs12) 555 { 556 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 557 return 0; 558 559 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 560 return -EINVAL; 561 562 
if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) 563 return -EINVAL; 564 565 return 0; 566 } 567 568 /* 569 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 570 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 571 * only the "disable intercept" case needs to be handled. 572 */ 573 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 574 unsigned long *msr_bitmap_l0, 575 u32 msr, int type) 576 { 577 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 578 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 579 580 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 581 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 582 } 583 584 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 585 { 586 int msr; 587 588 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 589 unsigned word = msr / BITS_PER_LONG; 590 591 msr_bitmap[word] = ~0; 592 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 593 } 594 } 595 596 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 597 static inline \ 598 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 599 unsigned long *msr_bitmap_l1, \ 600 unsigned long *msr_bitmap_l0, u32 msr) \ 601 { \ 602 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 603 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 604 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 605 else \ 606 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 607 } 608 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 609 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 610 611 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 612 unsigned long *msr_bitmap_l1, 613 unsigned long *msr_bitmap_l0, 614 u32 msr, int types) 615 { 616 if (types & MSR_TYPE_R) 617 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 618 msr_bitmap_l0, msr); 619 if (types & MSR_TYPE_W) 620 nested_vmx_set_msr_write_intercept(vmx, 
msr_bitmap_l1, 621 msr_bitmap_l0, msr); 622 } 623 624 /* 625 * Merge L0's and L1's MSR bitmap, return false to indicate that 626 * we do not use the hardware. 627 */ 628 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 629 struct vmcs12 *vmcs12) 630 { 631 struct vcpu_vmx *vmx = to_vmx(vcpu); 632 int msr; 633 unsigned long *msr_bitmap_l1; 634 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 635 struct kvm_host_map map; 636 637 /* Nothing to do if the MSR bitmap is not in use. */ 638 if (!cpu_has_vmx_msr_bitmap() || 639 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 640 return false; 641 642 /* 643 * MSR bitmap update can be skipped when: 644 * - MSR bitmap for L1 hasn't changed. 645 * - Nested hypervisor (L1) is attempting to launch the same L2 as 646 * before. 647 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 648 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 649 */ 650 if (!vmx->nested.force_msr_bitmap_recalc) { 651 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 652 653 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 654 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 655 return true; 656 } 657 658 if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) 659 return false; 660 661 msr_bitmap_l1 = (unsigned long *)map.hva; 662 663 /* 664 * To keep the control flow simple, pay eight 8-byte writes (sixteen 665 * 4-byte writes on 32-bit systems) up front to enable intercepts for 666 * the x2APIC MSR range and selectively toggle those relevant to L2. 667 */ 668 enable_x2apic_msr_intercepts(msr_bitmap_l0); 669 670 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 671 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 672 /* 673 * L0 need not intercept reads for MSRs between 0x800 674 * and 0x8ff, it just lets the processor take the value 675 * from the virtual-APIC page; take those 256 bits 676 * directly from the L1 bitmap. 
677 */ 678 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 679 unsigned word = msr / BITS_PER_LONG; 680 681 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 682 } 683 } 684 685 nested_vmx_disable_intercept_for_x2apic_msr( 686 msr_bitmap_l1, msr_bitmap_l0, 687 X2APIC_MSR(APIC_TASKPRI), 688 MSR_TYPE_R | MSR_TYPE_W); 689 690 if (nested_cpu_has_vid(vmcs12)) { 691 nested_vmx_disable_intercept_for_x2apic_msr( 692 msr_bitmap_l1, msr_bitmap_l0, 693 X2APIC_MSR(APIC_EOI), 694 MSR_TYPE_W); 695 nested_vmx_disable_intercept_for_x2apic_msr( 696 msr_bitmap_l1, msr_bitmap_l0, 697 X2APIC_MSR(APIC_SELF_IPI), 698 MSR_TYPE_W); 699 } 700 } 701 702 /* 703 * Always check vmcs01's bitmap to honor userspace MSR filters and any 704 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 705 */ 706 #ifdef CONFIG_X86_64 707 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 708 MSR_FS_BASE, MSR_TYPE_RW); 709 710 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 711 MSR_GS_BASE, MSR_TYPE_RW); 712 713 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 714 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 715 #endif 716 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 717 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 718 719 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 720 MSR_IA32_PRED_CMD, MSR_TYPE_W); 721 722 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 723 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 724 725 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 726 MSR_IA32_APERF, MSR_TYPE_R); 727 728 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 729 MSR_IA32_MPERF, MSR_TYPE_R); 730 731 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 732 MSR_IA32_U_CET, MSR_TYPE_RW); 733 734 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 735 MSR_IA32_S_CET, MSR_TYPE_RW); 736 737 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, 
msr_bitmap_l0, 738 MSR_IA32_PL0_SSP, MSR_TYPE_RW); 739 740 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 741 MSR_IA32_PL1_SSP, MSR_TYPE_RW); 742 743 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 744 MSR_IA32_PL2_SSP, MSR_TYPE_RW); 745 746 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 747 MSR_IA32_PL3_SSP, MSR_TYPE_RW); 748 749 kvm_vcpu_unmap(vcpu, &map); 750 751 vmx->nested.force_msr_bitmap_recalc = false; 752 753 return true; 754 } 755 756 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 757 struct vmcs12 *vmcs12) 758 { 759 struct vcpu_vmx *vmx = to_vmx(vcpu); 760 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 761 762 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 763 vmcs12->vmcs_link_pointer == INVALID_GPA) 764 return; 765 766 if (ghc->gpa != vmcs12->vmcs_link_pointer && 767 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 768 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 769 return; 770 771 kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), 772 VMCS12_SIZE); 773 } 774 775 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 776 struct vmcs12 *vmcs12) 777 { 778 struct vcpu_vmx *vmx = to_vmx(vcpu); 779 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 780 781 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 782 vmcs12->vmcs_link_pointer == INVALID_GPA) 783 return; 784 785 if (ghc->gpa != vmcs12->vmcs_link_pointer && 786 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 787 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 788 return; 789 790 kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), 791 VMCS12_SIZE); 792 } 793 794 /* 795 * In nested virtualization, check if L1 has set 796 * VM_EXIT_ACK_INTR_ON_EXIT 797 */ 798 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 799 { 800 return get_vmcs12(vcpu)->vm_exit_controls & 801 VM_EXIT_ACK_INTR_ON_EXIT; 802 } 803 804 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 805 
struct vmcs12 *vmcs12) 806 { 807 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 808 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 809 return -EINVAL; 810 else 811 return 0; 812 } 813 814 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 815 struct vmcs12 *vmcs12) 816 { 817 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 818 !nested_cpu_has_apic_reg_virt(vmcs12) && 819 !nested_cpu_has_vid(vmcs12) && 820 !nested_cpu_has_posted_intr(vmcs12)) 821 return 0; 822 823 /* 824 * If virtualize x2apic mode is enabled, 825 * virtualize apic access must be disabled. 826 */ 827 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 828 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 829 return -EINVAL; 830 831 /* 832 * If virtual interrupt delivery is enabled, 833 * we must exit on external interrupts. 834 */ 835 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 836 return -EINVAL; 837 838 /* 839 * bits 15:8 should be zero in posted_intr_nv, 840 * the descriptor address has been already checked 841 * in nested_get_vmcs12_pages. 842 * 843 * bits 5:0 of posted_intr_desc_addr should be zero. 844 */ 845 if (nested_cpu_has_posted_intr(vmcs12) && 846 (CC(!nested_cpu_has_vid(vmcs12)) || 847 CC(!nested_exit_intr_ack_set(vcpu)) || 848 CC((vmcs12->posted_intr_nv & 0xff00)) || 849 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 850 return -EINVAL; 851 852 /* tpr shadow is needed by all apicv features. 
*/ 853 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 854 return -EINVAL; 855 856 return 0; 857 } 858 859 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 860 { 861 struct vcpu_vmx *vmx = to_vmx(vcpu); 862 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 863 vmx->nested.msrs.misc_high); 864 865 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 866 } 867 868 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 869 u32 count, u64 addr) 870 { 871 if (count == 0) 872 return 0; 873 874 /* 875 * Exceeding the limit results in architecturally _undefined_ behavior, 876 * i.e. KVM is allowed to do literally anything in response to a bad 877 * limit. Immediately generate a consistency check so that code that 878 * consumes the count doesn't need to worry about extreme edge cases. 879 */ 880 if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) 881 return -EINVAL; 882 883 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 884 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 885 return -EINVAL; 886 887 return 0; 888 } 889 890 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 891 struct vmcs12 *vmcs12) 892 { 893 if (CC(nested_vmx_check_msr_switch(vcpu, 894 vmcs12->vm_exit_msr_load_count, 895 vmcs12->vm_exit_msr_load_addr)) || 896 CC(nested_vmx_check_msr_switch(vcpu, 897 vmcs12->vm_exit_msr_store_count, 898 vmcs12->vm_exit_msr_store_addr))) 899 return -EINVAL; 900 901 return 0; 902 } 903 904 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 905 struct vmcs12 *vmcs12) 906 { 907 if (CC(nested_vmx_check_msr_switch(vcpu, 908 vmcs12->vm_entry_msr_load_count, 909 vmcs12->vm_entry_msr_load_addr))) 910 return -EINVAL; 911 912 return 0; 913 } 914 915 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 916 struct vmcs12 *vmcs12) 917 { 918 if (!nested_cpu_has_pml(vmcs12)) 919 return 0; 920 921 if 
(CC(!nested_cpu_has_ept(vmcs12)) || 922 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 923 return -EINVAL; 924 925 return 0; 926 } 927 928 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 929 struct vmcs12 *vmcs12) 930 { 931 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 932 !nested_cpu_has_ept(vmcs12))) 933 return -EINVAL; 934 return 0; 935 } 936 937 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 938 struct vmcs12 *vmcs12) 939 { 940 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 941 !nested_cpu_has_ept(vmcs12))) 942 return -EINVAL; 943 return 0; 944 } 945 946 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 947 struct vmcs12 *vmcs12) 948 { 949 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 950 return 0; 951 952 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 953 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 954 return -EINVAL; 955 956 return 0; 957 } 958 959 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 960 struct vmx_msr_entry *e) 961 { 962 /* x2APIC MSR accesses are not allowed */ 963 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 964 return -EINVAL; 965 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 966 CC(e->index == MSR_IA32_UCODE_REV)) 967 return -EINVAL; 968 if (CC(e->reserved != 0)) 969 return -EINVAL; 970 return 0; 971 } 972 973 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 974 struct vmx_msr_entry *e) 975 { 976 if (CC(e->index == MSR_FS_BASE) || 977 CC(e->index == MSR_GS_BASE) || 978 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 979 nested_vmx_msr_check_common(vcpu, e)) 980 return -EINVAL; 981 return 0; 982 } 983 984 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 985 struct vmx_msr_entry *e) 986 { 987 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 988 
nested_vmx_msr_check_common(vcpu, e)) 989 return -EINVAL; 990 return 0; 991 } 992 993 /* 994 * Load guest's/host's msr at nested entry/exit. 995 * return 0 for success, entry index for failure. 996 * 997 * One of the failure modes for MSR load/store is when a list exceeds the 998 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 999 * as possible, process all valid entries before failing rather than precheck 1000 * for a capacity violation. 1001 */ 1002 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1003 { 1004 u32 i; 1005 struct vmx_msr_entry e; 1006 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1007 1008 for (i = 0; i < count; i++) { 1009 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1010 goto fail; 1011 1012 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 1013 &e, sizeof(e))) { 1014 pr_debug_ratelimited( 1015 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1016 __func__, i, gpa + i * sizeof(e)); 1017 goto fail; 1018 } 1019 if (nested_vmx_load_msr_check(vcpu, &e)) { 1020 pr_debug_ratelimited( 1021 "%s check failed (%u, 0x%x, 0x%x)\n", 1022 __func__, i, e.index, e.reserved); 1023 goto fail; 1024 } 1025 if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { 1026 pr_debug_ratelimited( 1027 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1028 __func__, i, e.index, e.value); 1029 goto fail; 1030 } 1031 } 1032 return 0; 1033 fail: 1034 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 1035 return i + 1; 1036 } 1037 1038 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 1039 u32 msr_index, 1040 u64 *data) 1041 { 1042 struct vcpu_vmx *vmx = to_vmx(vcpu); 1043 1044 /* 1045 * If the L0 hypervisor stored a more accurate value for the TSC that 1046 * does not include the time taken for emulation of the L2->L1 1047 * VM-exit in L0, use the more accurate value. 
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			/* Scale/offset the hardware-captured value into L1's TSC domain. */
			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

/*
 * Read entry @i of the MSR list at guest physical address @gpa into @e and
 * validate it as a store-list entry.  Only the first two u32s (index and
 * reserved) are read; the value field is left uninitialized.
 */
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

/*
 * Emulate the VM-Exit MSR-store: for each valid list entry, read the MSR's
 * current value and write it back into the guest's list at @gpa.
 * Returns 0 on success, -EINVAL on any list or access failure.
 */
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

/*
 * Returns true if @msr_index appears in vmcs12's VM-Exit MSR-store list.
 * An unreadable or invalid entry terminates the scan with "not found".
 */
static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

/*
 * Keep KVM's internal vmcs02 MSR-store (autostore) list in sync with
 * vmcs12's VM-Exit MSR-store list for @msr_index: add the MSR if vmcs12
 * wants it stored and it isn't tracked yet, or remove it (swap with the
 * last entry) if vmcs12 no longer stores it.
 */
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() by reading KVM's
			 * internal MSR state instead of reading the value from
			 * the vmcs02 VMExit MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
1178 */ 1179 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1180 bool nested_ept, bool reload_pdptrs, 1181 enum vm_entry_failure_code *entry_failure_code) 1182 { 1183 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1184 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1190 * must not be dereferenced. 1191 */ 1192 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1193 CC(!load_pdptrs(vcpu, cr3))) { 1194 *entry_failure_code = ENTRY_FAIL_PDPTE; 1195 return -EINVAL; 1196 } 1197 1198 vcpu->arch.cr3 = cr3; 1199 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1200 1201 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1202 kvm_init_mmu(vcpu); 1203 1204 if (!nested_ept) 1205 kvm_mmu_new_pgd(vcpu, cr3); 1206 1207 return 0; 1208 } 1209 1210 /* 1211 * Returns if KVM is able to config CPU to tag TLB entries 1212 * populated by L2 differently than TLB entries populated 1213 * by L1. 1214 * 1215 * If L0 uses EPT, L1 and L2 run with different EPTP because 1216 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1217 * are tagged with different EPTP. 1218 * 1219 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1220 * with different VPID (L1 entries are tagged with vmx->vpid 1221 * while L2 entries are tagged with vmx->nested.vpid02). 
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

/*
 * Emulate the TLB flush architecturally implied by a nested VM-Enter
 * (@is_vmenter == true) or VM-Exit, by queueing the appropriate flush
 * request on @vcpu.
 */
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmc12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

/* Returns true if every bit of @subset (under @mask) is also set in @superset. */
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

/*
 * Validate and store a userspace-restored value of MSR_IA32_VMX_BASIC,
 * rejecting any value that advertises features or fields beyond what KVM
 * itself supports.
 */
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS |
				 VMX_BASIC_NO_HW_ERROR_CODE_CC;

	const u64 reserved_bits = GENMASK_ULL(63, 57) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	/* The VMCS revision ID is a fixed property and must not change. */
	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	/* The advertised VMCS size may shrink but never grow beyond KVM's. */
	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

/*
 * Map a VMX control-MSR index to pointers to its low (must-be-1) and high
 * (allowed-1) halves within @msrs.  BUG()s on an unsupported index; callers
 * only pass the MSRs handled below.
 */
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

/*
 * Validate and store a userspace-restored VMX control MSR: the restored
 * value may only tighten (never loosen) what KVM's own configuration
 * allows.
 */
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

/*
 * Validate and store a userspace-restored value of MSR_IA32_VMX_MISC.
 */
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
1414 */ 1415 if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1416 return -EINVAL; 1417 1418 if ((vmx->nested.msrs.pinbased_ctls_high & 1419 PIN_BASED_VMX_PREEMPTION_TIMER) && 1420 vmx_misc_preemption_timer_rate(data) != 1421 vmx_misc_preemption_timer_rate(vmx_misc)) 1422 return -EINVAL; 1423 1424 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1425 return -EINVAL; 1426 1427 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1428 return -EINVAL; 1429 1430 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1431 return -EINVAL; 1432 1433 vmx->nested.msrs.misc_low = data; 1434 vmx->nested.msrs.misc_high = data >> 32; 1435 1436 return 0; 1437 } 1438 1439 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1440 { 1441 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1442 vmcs_config.nested.vpid_caps); 1443 1444 /* Every bit is either reserved or a feature bit. */ 1445 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1446 return -EINVAL; 1447 1448 vmx->nested.msrs.ept_caps = data; 1449 vmx->nested.msrs.vpid_caps = data >> 32; 1450 return 0; 1451 } 1452 1453 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1454 { 1455 switch (msr_index) { 1456 case MSR_IA32_VMX_CR0_FIXED0: 1457 return &msrs->cr0_fixed0; 1458 case MSR_IA32_VMX_CR4_FIXED0: 1459 return &msrs->cr4_fixed0; 1460 default: 1461 BUG(); 1462 } 1463 } 1464 1465 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1466 { 1467 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1468 1469 /* 1470 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1471 * must be 1 in the restored value. 1472 */ 1473 if (!is_bitwise_subset(data, *msr, -1ULL)) 1474 return -EINVAL; 1475 1476 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1477 return 0; 1478 } 1479 1480 /* 1481 * Called when userspace is restoring VMX MSRs. 
1482 * 1483 * Returns 0 on success, non-0 otherwise. 1484 */ 1485 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1486 { 1487 struct vcpu_vmx *vmx = to_vmx(vcpu); 1488 1489 /* 1490 * Don't allow changes to the VMX capability MSRs while the vCPU 1491 * is in VMX operation. 1492 */ 1493 if (vmx->nested.vmxon) 1494 return -EBUSY; 1495 1496 switch (msr_index) { 1497 case MSR_IA32_VMX_BASIC: 1498 return vmx_restore_vmx_basic(vmx, data); 1499 case MSR_IA32_VMX_PINBASED_CTLS: 1500 case MSR_IA32_VMX_PROCBASED_CTLS: 1501 case MSR_IA32_VMX_EXIT_CTLS: 1502 case MSR_IA32_VMX_ENTRY_CTLS: 1503 /* 1504 * The "non-true" VMX capability MSRs are generated from the 1505 * "true" MSRs, so we do not support restoring them directly. 1506 * 1507 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1508 * should restore the "true" MSRs with the must-be-1 bits 1509 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1510 * DEFAULT SETTINGS". 1511 */ 1512 return -EINVAL; 1513 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1514 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1515 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1516 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1517 case MSR_IA32_VMX_PROCBASED_CTLS2: 1518 return vmx_restore_control_msr(vmx, msr_index, data); 1519 case MSR_IA32_VMX_MISC: 1520 return vmx_restore_vmx_misc(vmx, data); 1521 case MSR_IA32_VMX_CR0_FIXED0: 1522 case MSR_IA32_VMX_CR4_FIXED0: 1523 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1524 case MSR_IA32_VMX_CR0_FIXED1: 1525 case MSR_IA32_VMX_CR4_FIXED1: 1526 /* 1527 * These MSRs are generated based on the vCPU's CPUID, so we 1528 * do not support restoring them directly. 
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		/* Only VM functions KVM itself supports may be advertised. */
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		/* The "non-true" MSR reports the default1 class bits as set. */
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
1634 */ 1635 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1636 { 1637 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1638 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1639 struct shadow_vmcs_field field; 1640 unsigned long val; 1641 int i; 1642 1643 if (WARN_ON(!shadow_vmcs)) 1644 return; 1645 1646 preempt_disable(); 1647 1648 vmcs_load(shadow_vmcs); 1649 1650 for (i = 0; i < max_shadow_read_write_fields; i++) { 1651 field = shadow_read_write_fields[i]; 1652 val = __vmcs_readl(field.encoding); 1653 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1654 } 1655 1656 vmcs_clear(shadow_vmcs); 1657 vmcs_load(vmx->loaded_vmcs->vmcs); 1658 1659 preempt_enable(); 1660 } 1661 1662 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1663 { 1664 const struct shadow_vmcs_field *fields[] = { 1665 shadow_read_write_fields, 1666 shadow_read_only_fields 1667 }; 1668 const int max_fields[] = { 1669 max_shadow_read_write_fields, 1670 max_shadow_read_only_fields 1671 }; 1672 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1673 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1674 struct shadow_vmcs_field field; 1675 unsigned long val; 1676 int i, q; 1677 1678 if (WARN_ON(!shadow_vmcs)) 1679 return; 1680 1681 vmcs_load(shadow_vmcs); 1682 1683 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1684 for (i = 0; i < max_fields[q]; i++) { 1685 field = fields[q][i]; 1686 val = vmcs12_read_any(vmcs12, field.encoding, 1687 field.offset); 1688 __vmcs_writel(field.encoding, val); 1689 } 1690 } 1691 1692 vmcs_clear(shadow_vmcs); 1693 vmcs_load(vmx->loaded_vmcs->vmcs); 1694 } 1695 1696 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1697 { 1698 #ifdef CONFIG_KVM_HYPERV 1699 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1700 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1701 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1702 1703 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1704 vmcs12->tpr_threshold = 
		evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state =
vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * Copy the fields of the cached vmcs12 that KVM may have modified while
 * emulating L2 back into the guest-visible enlightened VMCS so that L1
 * observes them.  The leading comment enumerates the fields that are
 * intentionally NOT written back: host state and controls owned by L1,
 * fields sync_vmcs02_to_vmcs12() never touches, and eVMCS fields with no
 * vmcs12 counterpart.
 */
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
	 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
	 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
	 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
	 * evmcs->host_ssp = vmcs12->host_ssp;
	 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
	 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
	 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
	 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
	 * evmcs->guest_ssp = vmcs12->guest_ssp;
	 */

	/* Guest segment selectors. */
	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	/* Guest segment and descriptor-table limits. */
	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	/* Guest segment access-rights bytes. */
	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	/* Guest segment and descriptor-table bases. */
	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	/* Exit information fields, filled by KVM on emulated VM-Exit. */
	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	/* eVMCS only exists when the vCPU advertises Hyper-V enlightenments. */
	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		/* Drop the previous mapping before establishing the new one. */
		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
		 * value to first u32 field of eVMCS which should specify eVMCS
		 * VersionNumber.
		 *
		 * Guest should be aware of supported eVMCS versions by host by
		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
		 * expected to set this CPUID leaf according to the value
		 * returned in vmcs_version from nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * to their own invented interface: When Hyper-V use eVMCS, it
		 * just sets first u32 field of eVMCS to revision_id specified
		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
		 * which is one of the supported versions specified in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To overcome Hyper-V bug, we accept here either a supported
		 * eVMCS version or VMCS12 revision_id as valid values for first
		 * u32 field of eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed) {
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		vmx->nested.force_msr_bitmap_recalc = true;
	}

	return EVMPTRLD_SUCCEEDED;
#else
	return EVMPTRLD_DISABLED;
#endif
}

/*
 * Flush the cached vmcs12 back to whichever guest-visible structure is in
 * use: the enlightened VMCS when an eVMCS pointer is loaded, the shadow
 * VMCS otherwise.
 */
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx))
		copy_vmcs12_to_enlightened(vmx);
	else
		copy_vmcs12_to_shadow(vmx);

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

/*
 * hrtimer callback for the emulated VMX preemption timer: flag expiry and
 * kick the vCPU so the pending L2->L1 exit is processed.
 */
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

/*
 * Compute the current preemption timer value in L1 "timer ticks" (the L1
 * TSC shifted right by the emulated rate).  The absolute deadline is
 * latched on first use so the countdown survives nested migration.
 */
static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

/*
 * Arm the hrtimer that emulates the VMX preemption timer for L2.
 * @preemption_timeout is in timer ticks and is converted to nanoseconds
 * using the vCPU's virtual TSC frequency.
 */
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
					u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	/* Can't convert ticks to ns without a TSC frequency. */
	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}

/*
 * Determine the EFER L2 will run with: vmcs12's value if L1 asked to load
 * it on entry, otherwise L1's EFER with LMA/LME forced to match vmcs12's
 * "IA-32e mode guest" entry control.
 */
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	else
		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}

static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;

	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g.
	 * HOST_CR3, will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

	/* All VMFUNCs are currently emulated through L0 vmexits. */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

/*
 * Set up the vmcs02 fields that only change when vmcs12 is dirty (or an
 * eVMCS is in use): the link pointer and the VPID used for L2.
 */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host.  Emulate this behavior by using vpid01 for L2
	 * if VPID is disabled in vmcs12.  Note, if VPID is disabled, VM-Enter
	 * and VM-Exit are architecturally required to flush VPID=0, but *only*
	 * VPID=0.  I.e. using vpid02 would be ok (so long as KVM emulates the
	 * required flushes), but doing so would cause KVM to over-flush.  E.g.
	 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
	 * and then runs L2 X again, then KVM can and should retain TLB entries
	 * for VPID12=1.
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

/*
 * Merge L0's (vmcs01) and L1's (vmcs12) VMX control fields into vmcs02,
 * prior to the actual hardware VM-Enter for L2.
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = __pin_controls_get(vmcs01);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.
	 */
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	} else {
		vmx->nested.posted_intr_nv = -1;
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	vmx->nested.l1_tpr_threshold = -1;
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		/* No TPR shadow: intercept CR8 accesses instead. */
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_ENABLE_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_DESC);

		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 *
	 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
	 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
	 * do the same for L2.
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= (vmcs12->vm_entry_controls &
			 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != kvm_host.efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}

/*
 * Read the guest CET state (S_CET, SSP, SSP table) from the current VMCS,
 * gated on the vCPU actually supporting IBT and/or SHSTK.
 */
static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
				u64 *ssp, u64 *ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		*s_cet = vmcs_readl(GUEST_S_CET);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		*ssp = vmcs_readl(GUEST_SSP);
		*ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
	}
}

/*
 * Write the guest CET state into the current VMCS, with the same feature
 * gating as vmcs_read_cet_state().
 */
static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
				 u64 ssp, u64 ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, s_cet);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, ssp);
		vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
	}
}

/*
 * Load vmcs12 guest state into vmcs02, skipping field groups the eVMCS
 * clean-fields bitmap marks as unchanged.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))
	{

		/* Guest segment registers and descriptor tables (GRP2). */
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		/* Segment registers changed; the cache is stale. */
		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access the L2's PDPTR, so save them to construct
		 * vmcs12
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, we
	 * simply ask to exit on each and every L2 page fault. This is done by
	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs.
 * In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	/*
	 * DR7/DEBUGCTL: take vmcs12's values only when L1 requested loading
	 * debug controls on this entry, otherwise restore the pre-VM-Enter
	 * values.
	 */
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	/* Combine L1's and L2's TSC offset/multiplier for the L2 run. */
	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4.  The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However when force loading the guest state (SMM exit or
	 * loading nested state after migration, it is possible to
	 * have invalid guest state now, which will be later fixed by
	 * restoring L2 register state
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

/*
 * Consistency checks for vmcs12's NMI controls: virtual NMIs require NMI
 * exiting, and NMI-window exiting requires virtual NMIs.
 */
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

/*
 * Validate the EPTP value L1 wants to use, against the EPT capabilities
 * KVM exposes to L1.
 */
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
} 2878 2879 /* Page-walk levels validity. */ 2880 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2881 case VMX_EPTP_PWL_5: 2882 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2883 return false; 2884 break; 2885 case VMX_EPTP_PWL_4: 2886 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2887 return false; 2888 break; 2889 default: 2890 return false; 2891 } 2892 2893 /* Reserved bits should not be set */ 2894 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2895 return false; 2896 2897 /* AD, if set, should be supported */ 2898 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2899 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2900 return false; 2901 } 2902 2903 return true; 2904 } 2905 2906 /* 2907 * Checks related to VM-Execution Control Fields 2908 */ 2909 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2910 struct vmcs12 *vmcs12) 2911 { 2912 struct vcpu_vmx *vmx = to_vmx(vcpu); 2913 2914 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2915 vmx->nested.msrs.pinbased_ctls_low, 2916 vmx->nested.msrs.pinbased_ctls_high)) || 2917 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2918 vmx->nested.msrs.procbased_ctls_low, 2919 vmx->nested.msrs.procbased_ctls_high))) 2920 return -EINVAL; 2921 2922 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2923 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2924 vmx->nested.msrs.secondary_ctls_low, 2925 vmx->nested.msrs.secondary_ctls_high))) 2926 return -EINVAL; 2927 2928 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2929 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2930 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2931 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2932 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2933 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2934 nested_vmx_check_nmi_controls(vmcs12) || 2935 
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2936 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2937 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2938 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2939 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2940 return -EINVAL; 2941 2942 if (!nested_cpu_has_preemption_timer(vmcs12) && 2943 nested_cpu_has_save_preemption_timer(vmcs12)) 2944 return -EINVAL; 2945 2946 if (nested_cpu_has_ept(vmcs12) && 2947 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2948 return -EINVAL; 2949 2950 if (nested_cpu_has_vmfunc(vmcs12)) { 2951 if (CC(vmcs12->vm_function_control & 2952 ~vmx->nested.msrs.vmfunc_controls)) 2953 return -EINVAL; 2954 2955 if (nested_cpu_has_eptp_switching(vmcs12)) { 2956 if (CC(!nested_cpu_has_ept(vmcs12)) || 2957 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2958 return -EINVAL; 2959 } 2960 } 2961 2962 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2963 CC(!vmcs12->tsc_multiplier)) 2964 return -EINVAL; 2965 2966 return 0; 2967 } 2968 2969 /* 2970 * Checks related to VM-Exit Control Fields 2971 */ 2972 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2973 struct vmcs12 *vmcs12) 2974 { 2975 struct vcpu_vmx *vmx = to_vmx(vcpu); 2976 2977 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2978 vmx->nested.msrs.exit_ctls_low, 2979 vmx->nested.msrs.exit_ctls_high)) || 2980 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2981 return -EINVAL; 2982 2983 return 0; 2984 } 2985 2986 /* 2987 * Checks related to VM-Entry Control Fields 2988 */ 2989 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2990 struct vmcs12 *vmcs12) 2991 { 2992 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 2994 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2995 vmx->nested.msrs.entry_ctls_low, 2996 vmx->nested.msrs.entry_ctls_high))) 2997 return -EINVAL; 2998 2999 /* 3000 * From the Intel SDM, volume 3: 3001 * 
Fields relevant to VM-entry event injection must be set properly. 3002 * These fields are the VM-entry interruption-information field, the 3003 * VM-entry exception error code, and the VM-entry instruction length. 3004 */ 3005 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3006 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3007 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3008 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3009 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3010 bool urg = nested_cpu_has2(vmcs12, 3011 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3012 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3013 3014 /* VM-entry interruption-info field: interruption type */ 3015 if (CC(intr_type == INTR_TYPE_RESERVED) || 3016 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3017 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3018 return -EINVAL; 3019 3020 /* VM-entry interruption-info field: vector */ 3021 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3022 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3023 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3024 return -EINVAL; 3025 3026 /* 3027 * Cannot deliver error code in real mode or if the interrupt 3028 * type is not hardware exception. For other cases, do the 3029 * consistency check only if the vCPU doesn't enumerate 3030 * VMX_BASIC_NO_HW_ERROR_CODE_CC. 
		 */
		if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
			if (CC(has_error_code))
				return -EINVAL;
		} else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
			if (CC(has_error_code != x86_exception_has_error_code(vector)))
				return -EINVAL;
		}

		/* VM-entry exception error code */
		if (CC(has_error_code &&
		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
			return -EINVAL;

		/* VM-entry interruption-info field: reserved bits */
		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
			return -EINVAL;

		/* VM-entry instruction length */
		switch (intr_type) {
		case INTR_TYPE_SOFT_EXCEPTION:
		case INTR_TYPE_SOFT_INTR:
		case INTR_TYPE_PRIV_SW_EXCEPTION:
			if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
			    CC(vmcs12->vm_entry_instruction_len == 0 &&
			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
				return -EINVAL;
		}
	}

	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
		return -EINVAL;

	return 0;
}

/* Run all VM-Enter control checks (execution, exit and entry controls). */
static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
	    nested_check_vm_entry_controls(vcpu, vmcs12))
		return -EINVAL;

#ifdef CONFIG_KVM_HYPERV
	if (guest_cpu_cap_has_evmcs(vcpu))
		return nested_evmcs_check_controls(vmcs12);
#endif

	return 0;
}

/*
 * Best-effort checks that can only be performed "late", i.e. after the
 * vmcs12 pages (here, the virtual-APIC page) have been mapped.
 */
static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
	u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;

	/*
	 * Don't bother with the consistency checks if KVM isn't configured to
	 * WARN on missed consistency checks, as KVM needs to rely on hardware
	 * to fully detect an illegal vTPR vs. TPR Threshold combination due to
	 * the vTPR being writable by L1 at all times (it's an in-memory value,
	 * not a VMCS field).  I.e. even if the check passes now, it might fail
	 * at the actual VM-Enter.
	 *
	 * Keying off the module param also allows treating an invalid vAPIC
	 * mapping as a consistency check failure without increasing the risk
	 * of breaking a "real" VM.
	 */
	if (!warn_on_missed_cc)
		return 0;

	if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    (CC(!vapic) ||
	     CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
		return -EINVAL;

	return 0;
}

/* The "host address-space size" exit control must match vCPU EFER.LMA. */
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
#ifdef CONFIG_X86_64
	if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
	       !!(vcpu->arch.efer & EFER_LMA)))
		return -EINVAL;
#endif
	return 0;
}

static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
{
	/*
	 * Check that the given linear address is canonical after a VM exit
	 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
	 */
	u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;

	return !__is_canonical_address(la, l1_address_bits_on_exit);
}

/* CET checks common to host- and guest-state: S_CET, SSP alignment, table. */
static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
					     u64 ssp, u64 ssp_tbl)
{
	if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
	    CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
		return -EINVAL;

	return 0;
}

/* Checks on the vmcs12 host-state area, per the SDM's VM-Entry checks. */
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);

	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
	    CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
		return -EINVAL;

	/* CR4.CET requires CR0.WP. */
	if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP)))
		return -EINVAL;

	if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
	    CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->host_ia32_perf_global_ctrl)))
		return -EINVAL;

	if (ia32e) {
		if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
			return -EINVAL;
	} else {
		if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
		    CC((vmcs12->host_rip) >> 32))
			return -EINVAL;
	}

	/* Host selectors: RPL and TI must be zero; CS/TR (and SS) non-null. */
	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_cs_selector == 0) ||
	    CC(vmcs12->host_tr_selector == 0) ||
	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
		return -EINVAL;

	if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
	    CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
	 * the values of the LMA and LME bits in the field must each be that of
	 * the host address-space size VM-exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
			return -EINVAL;
	}

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
						      vmcs12->host_ssp,
						      vmcs12->host_ssp_tbl))
			return -EINVAL;

		/*
		 * IA32_S_CET and SSP must be canonical if the host will
		 * enter 64-bit mode after VM-exit; otherwise, higher
		 * 32-bits must be all 0s.
		 */
		if (ia32e) {
			if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
			    CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
				return -EINVAL;
		} else {
			if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
				return -EINVAL;
		}
	}

	return 0;
}

/*
 * Validate vmcs12's VMCS link pointer: when set, it must be page-aligned and
 * legal, and the referenced shadow VMCS must have the expected revision ID
 * and shadow-VMCS flag.  Uses a gfn->hva cache to avoid re-translation.
 */
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
	struct vmcs_hdr hdr;

	if (vmcs12->vmcs_link_pointer == INVALID_GPA)
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
		return -EINVAL;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
					 vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
		return -EINVAL;

	if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
					    offsetof(struct vmcs12, hdr),
					    sizeof(hdr))))
		return -EINVAL;

	if (CC(hdr.revision_id != VMCS12_REVISION) ||
	    CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
		return -EINVAL;

	return 0;
}

/*
 * Checks related to Guest Non-register State
 */
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
		return -EINVAL;

	return 0;
}

/*
 * Checks on the vmcs12 guest-state area.  On failure, *entry_failure_code
 * holds the exit qualification to report to L1.
 */
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					enum vm_entry_failure_code *entry_failure_code)
{
	bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);

	*entry_failure_code = ENTRY_FAIL_DEFAULT;

	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
		return -EINVAL;

	/* CR4.CET requires CR0.WP. */
	if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
		return -EINVAL;

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
	    (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
	     CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
		return -EINVAL;

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
		return -EINVAL;

	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
		return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	/* CR0.PG without CR0.PE is illegal. */
	if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
		return -EINVAL;

	/* IA-32e mode guest requires paging and PAE. */
	if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
	    CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
			ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
						      vmcs12->guest_ssp,
						      vmcs12->guest_ssp_tbl))
			return -EINVAL;

		/*
		 * Guest SSP must have 63:N bits identical, rather than
		 * be canonical (i.e., 63:N-1 bits identical), where N is
		 * the CPU's maximum linear-address width. Similar to
		 * is_noncanonical_msr_address(), use the host's
		 * linear-address width.
		 */
		if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
			return -EINVAL;
	}

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_KVM_HYPERV
/*
 * (Re)map the enlightened VMCS if necessary.  Returns false if the eVMCS
 * pointer load fails, true otherwise (including when no mapping is pending).
 */
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
	 */
	if (guest_cpu_cap_has_evmcs(vcpu) &&
	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
		enum nested_evmptrld_status evmptrld_status =
			nested_vmx_handle_enlightened_vmptrld(vcpu, false);

		if (evmptrld_status == EVMPTRLD_VMFAIL ||
		    evmptrld_status == EVMPTRLD_ERROR)
			return false;

		/*
		 * Post migration VMCS12 always provides the most actual
		 * information, copy it to eVMCS upon entry.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}

	return true;
}
#endif

/*
 * Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor) into the host and point the relevant
 * vmcs02 fields at them.  Returns false on a fatal (internal-error) failure.
 */
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;
	}


	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		map = &vmx->nested.apic_access_page_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
		} else {
			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		} else {
			/*
			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
			 * access the contents of the VMCS12 posted interrupt
			 * descriptor. (Note that KVM may do this when it
			 * should not, per the architectural specification.)
			 */
			vmx->nested.pi_desc = NULL;
			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
		}
	}
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);

	return true;
}

/* KVM_REQ_GET_NESTED_STATE_PAGES handler: remap eVMCS and vmcs12 pages. */
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	/*
	 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
	 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
	 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
	 * migration.
	 */
	if (!nested_get_evmcs_page(vcpu)) {
		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
				     __func__);
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;

		return false;
	}
#endif

	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 
		return false;

	return true;
}

/*
 * Emulate a PML write on behalf of L2: log @gpa into vmcs12's PML buffer.
 * Returns 1 (PML full) when the buffer index is exhausted, 0 otherwise.
 */
static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t dst;

	if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
		return 0;

	if (WARN_ON_ONCE(vmx->nested.pml_full))
		return 1;

	/*
	 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
	 * set is already checked as part of A/D emulation.
	 */
	vmcs12 = get_vmcs12(vcpu);
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
		vmx->nested.pml_full = true;
		return 1;
	}

	/* PML logs 4KiB-aligned GPAs; the index counts down from the top. */
	gpa &= ~0xFFFull;
	dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;

	if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
				 offset_in_page(dst), sizeof(gpa)))
		return 0;

	vmcs12->guest_pml_index--;

	return 0;
}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	/* VMX instructions #UD outside VMX operation ... */
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	/* ... and #GP(0) at CPL > 0. */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}

static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
 */
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
							bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	enum vm_entry_failure_code entry_failure_code;
	union vmx_exit_reason exit_reason = {
		.basic = EXIT_REASON_INVALID_STATE,
		.failed_vmentry = 1,
	};
	u32 failed_index;

	trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
				 vmx->nested.current_vmptr,
				 vmcs12->guest_rip,
				 vmcs12->guest_intr_status,
				 vmcs12->vm_entry_intr_info_field,
				 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
				 vmcs12->ept_pointer,
				 vmcs12->guest_cr3,
				 KVM_ISA_VMX);

	kvm_service_local_tlb_flush_requests(vcpu);

	/*
	 * Snapshot DEBUGCTL/BNDCFGS/CET state that VM-Entry won't load from
	 * vmcs12, so prepare_vmcs02() can restore the pre-VM-Enter values.
	 */
	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
	if (kvm_mpx_supported() &&
	    (!vmx->nested.nested_run_pending ||
	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
				    &vmx->nested.pre_vmenter_ssp,
				    &vmx->nested.pre_vmenter_ssp_tbl);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled.  In the
	 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
	 * not KVM, KVM must unwind its software model to the pre-VM-Entry host
	 * state.  When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
	 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
	 * corrupt vcpu->arch.cr3.  Stuffing vmcs01.GUEST_CR3 results in the
	 * unwind naturally setting arch.cr3 to the correct value.  Smashing
	 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
	 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
	 * overwritten with a shadow CR3 prior to re-entering L1.
	 */
	if (!enable_ept)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

	prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);

	if (from_vmentry) {
		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}

		if (nested_vmx_check_guest_state(vcpu, vmcs12,
						 &entry_failure_code)) {
			exit_reason.basic = EXIT_REASON_INVALID_STATE;
			vmcs12->exit_qualification = entry_failure_code;
			goto vmentry_fail_vmexit;
		}
	}

	enter_guest_mode(vcpu);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
		exit_reason.basic = EXIT_REASON_INVALID_STATE;
		vmcs12->exit_qualification = entry_failure_code;
		goto vmentry_fail_vmexit_guest_mode;
	}

	if (from_vmentry) {
		failed_index = nested_vmx_load_msr(vcpu,
						   vmcs12->vm_entry_msr_load_addr,
						   vmcs12->vm_entry_msr_load_count);
		if (failed_index) {
			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
			vmcs12->exit_qualification = failed_index;
			goto vmentry_fail_vmexit_guest_mode;
		}
	} else {
		/*
		 * The MMU is not initialized to point at the right entities yet and
		 * "get pages" would need to read data from the guest (i.e. we will
		 * need to perform gpa to hpa translation). Request a call
		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

	/*
	 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
	 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
	 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
	 * unconditionally. Take care to pull data from vmcs01 as appropriate,
	 * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
	 */
	if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
						  CPU_BASED_NMI_WINDOW_EXITING)) ||
	    kvm_apic_has_pending_init_or_sipi(vcpu) ||
	    kvm_apic_has_interrupt(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Do not start the preemption timer hrtimer until after we know
	 * we are successful, so that only nested_vmx_vmexit needs to cancel
	 * the timer.
	 */
	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12)) {
		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
		vmx_start_preemption_timer(vcpu, timer_value);
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	/* VMLAUNCH/VMRESUME require VMX root operation and CPL 0. */
	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_branch_retired(vcpu);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	/* A current VMCS (regular or enlightened) is mandatory. */
	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS.  Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and act appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	/* VMLAUNCH needs a clear VMCS, VMRESUME a launched one. */
	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	vmx->nested.has_preemption_timer_deadline = false;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Hide L1D cache contents from the nested guest.  */
	kvm_request_l1tf_flush_l1d();

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that vmcs12 cache was
	 * transferred as part of captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	switch (vmcs12->guest_activity_state) {
	case GUEST_ACTIVITY_HLT:
		/*
		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
		 * awakened by event injection or by an NMI-window VM-exit or
		 * by an interrupt-window VM-exit, halt the vcpu.
		 */
		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
			vmx->nested.nested_run_pending = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}
		break;
	case GUEST_ACTIVITY_WAIT_SIPI:
		vmx->nested.nested_run_pending = 0;
		kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
		break;
	default:
		break;
	}

	return 1;

vmentry_failed:
	vmx->nested.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *     didn't trap the bit, because if L1 did, so would L0).
 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *     been modified by L2, and L1 knows it. So just leave the old value of
 *     the bit from vmcs12.guest_cr0.
 *     Note that the bit from vmcs02 GUEST_CR0
 *     isn't relevant, because if L0 traps this bit it can set it to anything.
 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *     changed these bits, and therefore they need to be updated, but L0
 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}

/* Same three-way merge as vmcs12_guest_cr0(), but for CR4. */
static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}

/*
 * Record any event that was mid-delivery (injected but not completed) into
 * vmcs12's IDT-vectoring info fields on a nested VM-Exit so L1 can re-inject.
 */
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12,
				      u32 vm_exit_reason, u32 exit_intr_info)
{
	u32 idt_vectoring;
	unsigned int nr;

	/*
	 * Per the SDM, VM-Exits due to double and triple faults are never
	 * considered to occur during event delivery, even if the double/triple
	 * fault is the result of an escalating vectoring issue.
	 *
	 * Note, the SDM qualifies the double fault behavior with "The original
	 * event results in a double-fault exception".  It's unclear why the
	 * qualification exists since exits due to double fault can occur only
	 * while vectoring a different exception (injected events are never
	 * subject to interception), i.e. there's _always_ an original event.
	 *
	 * The SDM also uses NMI as a confusing example for the "original event
	 * causes the VM exit directly" clause.  NMI isn't special in any way,
	 * the same rule applies to all events that cause an exit directly.
	 * NMI is an odd choice for the example because NMIs can only occur on
	 * instruction boundaries, i.e. they _can't_ occur during vectoring.
	 */
	if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
	    ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
	     is_double_fault(exit_intr_info))) {
		vmcs12->idt_vectoring_info_field = 0;
	} else if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.vector;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else {
		vmcs12->idt_vectoring_info_field = 0;
	}
}


/* Mark guest pages that hardware may have written via vmcs12 as dirty. */
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gfn_t gfn;

	/*
	 * Don't need to mark the APIC access page dirty; it is never
	 * written to by the CPU during APIC virtualization.
	 */

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}
}

/*
 * Emulate posted-interrupt processing for L2: move pending vectors from the
 * posted-interrupt descriptor's PIR into L2's virtual-APIC IRR and raise SVI
 * as needed.  Returns 0 on success, -ENXIO if the virtual-APIC page isn't
 * mapped and an exit to userspace is required.
 */
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	void *vapic_page;
	u16 status;

	if (!vmx->nested.pi_pending)
		return 0;

	if (!vmx->nested.pi_desc)
		goto mmio_needed;

	vmx->nested.pi_pending = false;

	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
		return 0;

	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
	if (max_irr > 0) {
		vapic_page = vmx->nested.virtual_apic_map.hva;
		if (!vapic_page)
			goto mmio_needed;

		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
				      vapic_page, &max_irr);
		status = vmcs_read16(GUEST_INTR_STATUS);
		if ((u8)max_irr > ((u8)status & 0xff)) {
			status &= ~0xff;
			status |= (u8)max_irr;
			vmcs_write16(GUEST_INTR_STATUS, status);
		}
	}

	nested_mark_vmcs12_pages_dirty(vcpu);
	return 0;

mmio_needed:
	kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
	return -ENXIO;
}

/* Synthesize an exception VM-Exit from L2 to L1 for an intercepted exception. */
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qual;

	if (ex->has_payload) {
		exit_qual = ex->payload;
	} else if (ex->vector == PF_VECTOR) {
		exit_qual = vcpu->arch.cr2;
	} else if (ex->vector == DB_VECTOR) {
		exit_qual = vcpu->arch.dr6;
		exit_qual &= ~DR6_BT;
		exit_qual ^= DR6_ACTIVE_LOW;
	} else {
		exit_qual = 0;
	}

	/*
	 * Unlike AMD's Paged Real Mode, which reports an error code on #PF
	 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
	 * "has error code" flags on VM-Exit if the CPU is in Real Mode.
	 */
	if (ex->has_error_code && is_protmode(vcpu)) {
		/*
		 * Intel CPUs do not generate error codes with bits 31:16 set,
		 * and more importantly VMX disallows setting bits 31:16 in the
		 * injected error code for VM-Entry.  Drop the bits to mimic
		 * hardware and avoid inducing failure on nested VM-Entry if L1
		 * chooses to inject the exception back to L2.  AMD CPUs _do_
		 * generate "full" 32-bit error codes, so KVM allows userspace
		 * to inject exception error codes with bits 31:16 set.
		 */
		vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (kvm_exception_is_soft(ex->vector))
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
	    vmx_get_nmi_mask(vcpu))
		intr_info |= INTR_INFO_UNBLOCK_NMI;

	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}

/*
 * Returns true if a debug trap is (likely) pending delivery.  Infer the class
 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
 * Using the payload is flawed because code breakpoints (fault-like) and data
 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
 * this will return false positives if a to-be-injected code breakpoint #DB is
 * pending (from KVM's perspective, but not "pending" across an instruction
 * boundary).  ICEBP, a.k.a.
INT1, is also not reflected here even though it 4109 * too is trap-like. 4110 * 4111 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4112 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4113 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4114 * from the emulator (because such #DBs are fault-like and thus don't trigger 4115 * actions that fire on instruction retire). 4116 */ 4117 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4118 { 4119 if (!ex->pending || ex->vector != DB_VECTOR) 4120 return 0; 4121 4122 /* General Detect #DBs are always fault-like. */ 4123 return ex->payload & ~DR6_BD; 4124 } 4125 4126 /* 4127 * Returns true if there's a pending #DB exception that is lower priority than 4128 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4129 * KVM, but could theoretically be injected by userspace. Note, this code is 4130 * imperfect, see above. 4131 */ 4132 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4133 { 4134 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4135 } 4136 4137 /* 4138 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4139 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4140 * represents these debug traps with a payload that is said to be compatible 4141 * with the 'pending debug exceptions' field, write the payload to the VMCS 4142 * field if a VM-exit is delivered before the debug trap. 
 */
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
	unsigned long pending_dbg;

	pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
	if (pending_dbg)
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
}

/* True if L1's VMX-preemption timer for L2 has expired and its VM-Exit is due. */
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	       to_vmx(vcpu)->nested.preemption_timer_expired;
}

/*
 * Returns true if a nested event is pending for L2, i.e. if KVM needs to get
 * control (or avoid entering the guest) in order to process the event.
 */
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	void *vapic = vmx->nested.virtual_apic_map.hva;
	int max_irr, vppr;

	if (nested_vmx_preemption_timer_pending(vcpu) ||
	    vmx->nested.mtf_pending)
		return true;

	/*
	 * Virtual Interrupt Delivery doesn't require manual injection.  Either
	 * the interrupt is already in GUEST_RVI and will be recognized by CPU
	 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
	 * the interrupt from the PIR to RVI prior to entering the guest.
	 */
	if (for_injection)
		return false;

	if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
	    __vmx_interrupt_blocked(vcpu))
		return false;

	if (!vapic)
		return false;

	vppr = *((u32 *)(vapic + APIC_PROCPRI));

	/* A virtual interrupt is deliverable only if it outranks virtual PPR. */
	max_irr = vmx_get_rvi();
	if ((max_irr & 0xf0) > (vppr & 0xf0))
		return true;

	if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
	    pi_test_on(vmx->nested.pi_desc)) {
		max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
		if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
			return true;
	}

	return false;
}

/*
 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
 * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1 Hardware Reset and Machine Checks
 *	- RESET
 *	- Machine Check
 *
 * 2 Trap on Task Switch
 *	- T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *	- FLUSH
 *	- STOPCLK
 *	- SMI
 *	- INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *	- Breakpoints
 *	- Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *	  breakpoint, or #DB due to a split-lock access)
 *
 * 4.3	VMX-preemption timer expired VM-exit
 *
 * 4.6	NMI-window exiting VM-exit[2]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *	- Code-Segment Limit Violation
 *	- Code Page Fault
 *	- Control protection exception (missing ENDBRANCH at target of indirect
 *					call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *	- Instruction length > 15 bytes
 *	- Invalid Opcode
 *	- Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *	- Overflow
 *	- Bound error
 *	- Invalid TSS
 *	- Segment Not Present
 *	- Stack fault
 *	- General Protection
 *	- Data Page Fault
 *	- Alignment Check
 *	- x86 FPU Floating-point exception
 *	- SIMD floating-point exception
 *	- Virtualization exception
 *	- Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
 *     timer take priority over VM exits caused by the "NMI-window exiting"
 *     VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by "NMI-window exiting".  VM exits caused by this control take
 *     priority over non-maskable interrupts (NMIs) and lower priority events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
 *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
 *     non-maskable interrupts (NMIs) and higher priority events take priority over
 *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
 *     priority over external interrupts and lower priority events.
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception.  If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = vmx->nested.nested_run_pending;
	/*
	 * Events that don't require injection, i.e. that are virtualized by
	 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
	 * to regain control in order to deliver the event, and hardware will
	 * handle event ordering, e.g. with respect to injected exceptions.
	 *
	 * But, new events (not exceptions) are only recognized at instruction
	 * boundaries.  If an event needs reinjection, then KVM is handling a
	 * VM-Exit that occurred _during_ instruction execution; new events,
	 * irrespective of whether or not they're injected, are blocked until
	 * the instruction completes.
	 */
	bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
	/*
	 * Inject events are blocked by nested VM-Enter, as KVM is responsible
	 * for managing priority between concurrent events, i.e. KVM needs to
	 * wait until after VM-Enter completes to deliver injected events.
	 */
	bool block_nested_events = block_nested_exceptions ||
				   block_non_injected_events;

	/* The checks below are ordered per the SDM's event priority table. */
	if (lapic_in_kernel(vcpu) &&
		test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		clear_bit(KVM_APIC_INIT, &apic->pending_events);
		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
			nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);

		/* MTF is discarded if the vCPU is in WFS. */
		vmx->nested.mtf_pending = false;
		return 0;
	}

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;

		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
						apic->sipi_vector & 0xFFUL);
			return 0;
		}
		/* Fallthrough, the SIPI is completely ignored. */
	}

	/*
	 * Process exceptions that are higher priority than Monitor Trap Flag:
	 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
	 * could theoretically come in from userspace), and ICEBP (INT1).
	 *
	 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
	 * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
	 * across SMI/RSM as it should; that needs to be addressed in order to
	 * prioritize SMI over MTF and trap-like #DBs.
	 */
	if (vcpu->arch.exception_vmexit.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vmx->nested.mtf_pending) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
		return 0;
	}

	/* Lower-priority (trap-like #DB) exceptions, after MTF. */
	if (vcpu->arch.exception_vmexit.pending) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (nested_vmx_preemption_timer_pending(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(vcpu))
			goto no_vmexit;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
		int irq;

		if (!nested_exit_on_intr(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			goto no_vmexit;
		}

		if (!nested_exit_intr_ack_set(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
			return 0;
		}

		irq = kvm_cpu_get_extint(vcpu);
		if (irq != -1) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
					  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
			return 0;
		}

		irq = kvm_apic_has_interrupt(vcpu);
		if (WARN_ON_ONCE(irq < 0))
			goto no_vmexit;

		/*
		 * If the IRQ is L2's PI notification vector, process posted
		 * interrupts for L2 instead of injecting VM-Exit, as the
		 * detection/morphing architecturally occurs when the IRQ is
		 * delivered to the CPU.  Note, only interrupts that are routed
		 * through the local APIC trigger posted interrupt processing,
		 * and enabling posted interrupts requires ACK-on-exit.
		 */
		if (irq == vmx->nested.posted_intr_nv) {
			/*
			 * Nested posted interrupts are delivered via RVI, i.e.
			 * aren't injected by KVM, and so can be queued even if
			 * manual event injection is disallowed.
			 */
			if (block_non_injected_events)
				return -EBUSY;

			vmx->nested.pi_pending = true;
			kvm_apic_clear_irr(vcpu, irq);
			goto no_vmexit;
		}

		if (block_nested_events)
			return -EBUSY;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
				  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);

		/*
		 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
		 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
		 * if APICv is active.
		 */
		kvm_apic_ack_interrupt(vcpu, irq);
		return 0;
	}

no_vmexit:
	return vmx_complete_nested_posted_interrupt(vcpu);
}

/* Convert the remaining hrtimer time into emulated preemption-timer units. */
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	/* ns -> guest TSC cycles, then scale by the emulated timer rate. */
	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}

/*
 * Returns true for the vmcs12 guest-state fields that are synchronized lazily
 * by sync_vmcs02_to_vmcs12_rare(), i.e. only when actually needed.
 */
static bool is_vmcs12_ext_field(unsigned long field)
{
	switch (field) {
	case GUEST_ES_SELECTOR:
	case GUEST_CS_SELECTOR:
	case GUEST_SS_SELECTOR:
	case GUEST_DS_SELECTOR:
	case GUEST_FS_SELECTOR:
	case GUEST_GS_SELECTOR:
	case GUEST_LDTR_SELECTOR:
	case GUEST_TR_SELECTOR:
	case GUEST_ES_LIMIT:
	case GUEST_CS_LIMIT:
	case GUEST_SS_LIMIT:
	case GUEST_DS_LIMIT:
	case GUEST_FS_LIMIT:
	case GUEST_GS_LIMIT:
	case GUEST_LDTR_LIMIT:
	case GUEST_TR_LIMIT:
	case GUEST_GDTR_LIMIT:
	case GUEST_IDTR_LIMIT:
	case GUEST_ES_AR_BYTES:
	case GUEST_DS_AR_BYTES:
	case GUEST_FS_AR_BYTES:
	case GUEST_GS_AR_BYTES:
	case GUEST_LDTR_AR_BYTES:
	case GUEST_TR_AR_BYTES:
	case GUEST_ES_BASE:
	case GUEST_CS_BASE:
	case GUEST_SS_BASE:
	case GUEST_DS_BASE:
	case GUEST_FS_BASE:
	case GUEST_GS_BASE:
	case GUEST_LDTR_BASE:
	case GUEST_TR_BASE:
	case GUEST_GDTR_BASE:
	case GUEST_IDTR_BASE:
	case GUEST_PENDING_DBG_EXCEPTIONS:
	case GUEST_BNDCFGS:
		return true;
	default:
		break;
	}

	return false;
}

/*
 * Read the rarely-used guest-state fields out of the currently loaded VMCS
 * (expected to be vmcs02) into vmcs12.
 */
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}

/*
 * Sync the rarely-used fields from vmcs02 to vmcs12 on demand, temporarily
 * loading vmcs02 (the caller is expected to have vmcs01 loaded).
 */
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;


	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	/* Disable preemption while vmcs02 is the loaded VMCS. */
	cpu = get_cpu();
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load_vmcs(vcpu, cpu);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	put_cpu();
}

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
4629 */ 4630 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4631 { 4632 struct vcpu_vmx *vmx = to_vmx(vcpu); 4633 4634 if (nested_vmx_is_evmptr12_valid(vmx)) 4635 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4636 4637 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4638 !nested_vmx_is_evmptr12_valid(vmx); 4639 4640 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4641 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4642 4643 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4644 vmcs12->guest_rip = kvm_rip_read(vcpu); 4645 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4646 4647 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4648 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4649 4650 vmcs12->guest_interruptibility_info = 4651 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4652 4653 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4654 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4655 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4656 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4657 else 4658 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4659 4660 if (nested_cpu_has_preemption_timer(vmcs12) && 4661 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4662 !vmx->nested.nested_run_pending) 4663 vmcs12->vmx_preemption_timer_value = 4664 vmx_get_preemption_timer_value(vcpu); 4665 4666 /* 4667 * In some cases (usually, nested EPT), L2 is allowed to change its 4668 * own CR3 without exiting. If it has changed it, we must keep it. 4669 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4670 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4671 * 4672 * Additionally, restore L2's PDPTR to vmcs12. 
4673 */ 4674 if (enable_ept) { 4675 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4676 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4677 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4678 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4679 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4680 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4681 } 4682 } 4683 4684 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4685 4686 if (nested_cpu_has_vid(vmcs12)) 4687 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4688 4689 vmcs12->vm_entry_controls = 4690 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4691 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4692 4693 /* 4694 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4695 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4696 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4697 * vmcs02 doesn't strictly track vmcs12. 4698 */ 4699 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4700 vmcs12->guest_dr7 = vcpu->arch.dr7; 4701 4702 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4703 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4704 4705 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4706 &vmcs12->guest_ssp, 4707 &vmcs12->guest_ssp_tbl); 4708 } 4709 4710 /* 4711 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4712 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4713 * and this function updates it to reflect the changes to the guest state while 4714 * L2 was running (and perhaps made some exits which were handled directly by L0 4715 * without going back to L1), and to reflect the exit reason. 4716 * Note that we do not have to copy here all VMCS fields, just those that 4717 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4718 * exit-information fields only. 
 * Other fields are modified by L1 with VMWRITE,
 * which already writes to vmcs12 directly.
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification, u32 exit_insn_len)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = vm_exit_reason;
	if (vmx_get_exit_reason(vcpu).enclave_mode)
		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
	vmcs12->exit_qualification = exit_qualification;

	/*
	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
	 * exit info fields are unmodified.
	 */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may wanted to inject into
		 * L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12,
					  vm_exit_reason, exit_intr_info);

		vmcs12->vm_exit_intr_info = exit_intr_info;
		vmcs12->vm_exit_instruction_len = exit_insn_len;
		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

		/*
		 * According to spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}
}

/*
 * A part of what we need to when the nested L2 guest exits and we want to
 * run its L1 parent, is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	enum vm_entry_failure_code ignored;
	struct kvm_segment seg;

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
	 *
	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it);
	 */
	vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmx_set_cr0(vcpu, vmcs12->host_cr0);

	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs12->host_cr4);

	nested_ept_uninit_mmu_context(vcpu);

	/*
	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
	 * couldn't have changed.
	 */
	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);

	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);

	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
		vmcs_write64(GUEST_BNDCFGS, 0);

	/*
	 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set.
	 * otherwise CET state should be retained across VM-exit, i.e.,
	 * guest values should be propagated from vmcs12 to vmcs01.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
		vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
				     vmcs12->host_ssp_tbl);
	else
		vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
				     vmcs12->guest_ssp_tbl);

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
		vcpu->arch.pat = vmcs12->host_ia32_pat;
	}
	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
		WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						     vmcs12->host_ia32_perf_global_ctrl));

	/* Set L1 segment info according to Intel SDM
	   27.5.2 Loading Host Segment and Descriptor-Table Registers */
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.selector = vmcs12->host_cs_selector,
		.type = 11,
		.present = 1,
		.s = 1,
		.g = 1
	};
	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		seg.l = 1;
	else
		seg.db = 1;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.type = 3,
		.present = 1,
		.s = 1,
		.db = 1,
		.g = 1
	};
	seg.selector = vmcs12->host_ds_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
	seg.selector = vmcs12->host_es_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
	seg.selector = vmcs12->host_ss_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
	seg.selector = vmcs12->host_fs_selector;
	seg.base = vmcs12->host_fs_base;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
	seg.selector = vmcs12->host_gs_selector;
	seg.base = vmcs12->host_gs_base;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
	seg = (struct kvm_segment) {
		.base = vmcs12->host_tr_base,
		.limit = 0x67,
		.selector = vmcs12->host_tr_selector,
		.type = 11,
		.present = 1
	};
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);

	/* LDTR is unusable (all zeros) after a VM-Exit to the host. */
	memset(&seg, 0, sizeof(seg));
	seg.unusable = 1;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);

	kvm_set_dr(vcpu, 7, 0x400);
	vmx_guest_debugctl_write(vcpu, 0);

	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
				vmcs12->vm_exit_msr_load_count))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);

	to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}

/*
 * Reconstruct the EFER value L1 was running with in vmcs01, checking in
 * priority order: the VMCS guest-EFER field (if the entry control loads
 * EFER), the host value (when hardware loads EFER natively), the MSR
 * autoload list, and finally the user-return MSR slot.
 */
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
	struct vmx_uret_msr *efer_msr;
	unsigned int i;

	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
		return vmcs_read64(GUEST_IA32_EFER);

	if (cpu_has_load_ia32_efer())
		return kvm_host.efer;

	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
			return vmx->msr_autoload.guest.val[i].value;
	}

	efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
	if (efer_msr)
		return efer_msr->data;

	return kvm_host.efer;
}

static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msr_entry g, h;
	gpa_t gpa;
	u32 i, j;

	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
		/*
		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
		 * as vmcs01.GUEST_DR7 contains a userspace defined value
		 * and vcpu->arch.dr7 is not squirreled away before the
		 * nested VMENTER (not worth adding a variable in nested_vmx).
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
		else
			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
	}

	/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
	vmx_reload_guest_debugctl(vcpu);

	/*
	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
	 * handle a variety of side effects to KVM's software model.
	 */
	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));

	vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));

	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));

	nested_ept_uninit_mmu_context(vcpu);
	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/*
	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
	 * from vmcs01 (if necessary). The PDPTRs are not loaded on
	 * VMFail, like everything else we just need to ensure our
	 * software model is up-to-date.
	 */
	if (enable_ept && is_pae_paging(vcpu))
		ept_save_pdptrs(vcpu);

	kvm_mmu_reset_context(vcpu);

	/*
	 * This nasty bit of open coding is a compromise between blindly
	 * loading L1's MSRs using the exit load lists (incorrect emulation
	 * of VMFail), leaving the nested VM's MSRs in the software model
	 * (incorrect behavior) and snapshotting the modified MSRs (too
	 * expensive since the lists are unbound by hardware). For each
	 * MSR that was (prematurely) loaded from the nested VMEntry load
	 * list, reload it from the exit load list if it exists and differs
	 * from the guest value. The intent is to stuff host state as
	 * silently as possible, not to fully process the exit load list.
	 */
	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
			pr_debug_ratelimited(
				"%s read MSR index failed (%u, 0x%08llx)\n",
				__func__, i, gpa);
			goto vmabort;
		}

		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
				pr_debug_ratelimited(
					"%s read MSR failed (%u, 0x%08llx)\n",
					__func__, j, gpa);
				goto vmabort;
			}
			if (h.index != g.index)
				continue;
			if (h.value == g.value)
				break;

			if (nested_vmx_load_msr_check(vcpu, &h)) {
				pr_debug_ratelimited(
					"%s check failed (%u, 0x%x, 0x%x)\n",
					__func__, j, h.index, h.reserved);
				goto vmabort;
			}

			if (kvm_emulate_msr_write(vcpu, h.index, h.value)) {
				pr_debug_ratelimited(
					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
					__func__, j, h.index, h.value);
				goto vmabort;
			}
		}
	}

	return;

vmabort:
	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}

/*
 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
 */
void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
			 u32 exit_intr_info, unsigned long exit_qualification,
			 u32 exit_insn_len)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* Pending MTF traps are discarded on VM-Exit.
	 */
	vmx->nested.mtf_pending = false;

	/* trying to cancel vmlaunch/vmresume is a bug */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

#ifdef CONFIG_KVM_HYPERV
	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
		/*
		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
		 * Enlightened VMCS after migration and we still need to
		 * do that when something is forcing L2->L1 exit prior to
		 * the first L2 run.
		 */
		(void)nested_get_evmcs_page(vcpu);
	}
#endif

	/* Service pending TLB flush requests for L2 before switching to L1. */
	kvm_service_local_tlb_flush_requests(vcpu);

	/*
	 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
	 * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
	 * up-to-date before switching to L1.
	 */
	if (enable_ept && is_pae_paging(vcpu))
		vmx_ept_load_pdptrs(vcpu);

	leave_guest_mode(vcpu);

	if (nested_cpu_has_preemption_timer(vmcs12))
		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);

	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
		vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
		if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
			vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
	}

	if (likely(!vmx->fail)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);

		if (vm_exit_reason != -1)
			prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
				       exit_intr_info, exit_qualification,
				       exit_insn_len);

		/*
		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
		 * also be used to capture vmcs12 cache as part of
		 * capturing nVMX state for snapshot (migration).
		 *
		 * Otherwise, this flush will dirty guest memory at a
		 * point it is already assumed by user-space to be
		 * immutable.
		 */
		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
	} else {
		/*
		 * The only expected VM-instruction error is "VM entry with
		 * invalid control field(s)." Anything else indicates a
		 * problem with L0.
		 */
		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);

		/* VM-Fail at VM-Entry means KVM missed a consistency check. */
		WARN_ON_ONCE(warn_on_missed_cc);
	}

	/*
	 * Drop events/exceptions that were queued for re-injection to L2
	 * (picked up via vmx_complete_interrupts()), as well as exceptions
	 * that were pending for L2.  Note, this must NOT be hoisted above
	 * prepare_vmcs12(), events/exceptions queued for re-injection need to
	 * be captured in vmcs12 (see vmcs12_save_pending_event()).
	 */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	kvm_nested_vmexit_handle_ibrs(vcpu);

	/* Update any VMCS fields that might have changed while L2 ran */
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	if (vmx->nested.l1_tpr_threshold != -1)
		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);

	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
		vmx->nested.change_vmcs01_virtual_apic_mode = false;
		vmx_set_virtual_apic_mode(vcpu);
	}

	if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
		vmx->nested.update_vmcs01_cpu_dirty_logging = false;
		vmx_update_cpu_dirty_logging(vcpu);
	}

	nested_put_vmcs12_pages(vcpu);

	if (vmx->nested.reload_vmcs01_apic_access_page) {
		vmx->nested.reload_vmcs01_apic_access_page = false;
		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
	}

	if (vmx->nested.update_vmcs01_apicv_status) {
		vmx->nested.update_vmcs01_apicv_status = false;
		vmx_refresh_apicv_exec_ctrl(vcpu);
	}

	if (vmx->nested.update_vmcs01_hwapic_isr) {
		vmx->nested.update_vmcs01_hwapic_isr = false;
		kvm_apic_update_hwapic_isr(vcpu);
	}

	if ((vm_exit_reason != -1) &&
	    (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
		vmx->nested.need_vmcs12_to_shadow_sync = true;

	/* in case we halted in L2 */
	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);

	if (likely(!vmx->fail)) {
		if (vm_exit_reason != -1)
			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
						       vmcs12->exit_qualification,
						       vmcs12->idt_vectoring_info_field,
						       vmcs12->vm_exit_intr_info,
						       vmcs12->vm_exit_intr_error_code,
						       KVM_ISA_VMX);

		load_vmcs12_host_state(vcpu, vmcs12);

		/*
		 * Process events if an injectable IRQ or NMI is pending, even
		 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
		 * If an event became pending while L2 was active, KVM needs to
		 * either inject the event or request an IRQ/NMI window.  SMIs
		 * don't need to be processed as SMM is mutually exclusive with
		 * non-root mode.  INIT/SIPI don't need to be checked as INIT
		 * is blocked post-VMXON, and SIPIs are ignored.
		 */
		if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
			kvm_make_request(KVM_REQ_EVENT, vcpu);
		return;
	}

	/*
	 * After an early L2 VM-entry failure, we're now back
	 * in L1 which thinks it just finished a VMLAUNCH or
	 * VMRESUME instruction, so we need to set the failure
	 * flag and the VM-instruction error field of the VMCS
	 * accordingly, and skip the emulated instruction.
	 */
	(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/*
	 * Restore L1's host state to KVM's software model.  We're here
	 * because a consistency check was caught by hardware, which
	 * means some amount of guest state has been propagated to KVM's
	 * model and needs to be unwound to the host's state.
	 */
	nested_vmx_restore_host_state(vcpu);

	vmx->fail = 0;
}

/* Reflect a triple fault in L2 as a VM-Exit to L1. */
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}

/*
 * Decode the memory-address operand of a vmx instruction, as recorded on an
 * exit caused by such an instruction (run by a guest hypervisor).
 * On success, returns 0. When the operand is invalid, returns 1 and throws
 * #UD, #GP, or #SS.
 */
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
	gva_t off;
	bool exn;
	struct kvm_segment s;

	/*
	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
	 * Execution", on an exit, vmx_instruction_info holds most of the
	 * addressing components of the operand. Only the displacement part
	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
	 * For how an actual address is calculated from all these components,
	 * refer to Vol. 1, "Operand Addressing".
	 */
	int  scaling = vmx_instruction_info & 3;
	int  addr_size = (vmx_instruction_info >> 7) & 7;
	bool is_reg = vmx_instruction_info & (1u << 10);
	int  seg_reg = (vmx_instruction_info >> 15) & 7;
	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));

	if (is_reg) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* Addr = segment_base + offset */
	/* offset = base + [index * scale] + displacement */
	off = exit_qualification; /* holds the displacement */
	if (addr_size == 1)
		off = (gva_t)sign_extend64(off, 31);
	else if (addr_size == 0)
		off = (gva_t)sign_extend64(off, 15);
	if (base_is_valid)
		off += kvm_register_read(vcpu, base_reg);
	if (index_is_valid)
		off += kvm_register_read(vcpu, index_reg) << scaling;
	vmx_get_segment(vcpu, &s, seg_reg);

	/*
	 * The effective address, i.e. @off, of a memory operand is truncated
	 * based on the address size of the instruction.  Note that this is
	 * the *effective address*, i.e. the address prior to accounting for
	 * the segment's base.
	 */
	if (addr_size == 1) /* 32 bit */
		off &= 0xffffffff;
	else if (addr_size == 0) /* 16 bit */
		off &= 0xffff;

	/* Checks for #GP/#SS exceptions. */
	exn = false;
	if (is_long_mode(vcpu)) {
		/*
		 * The virtual/linear address is never truncated in 64-bit
		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
		 * address when using FS/GS with a non-zero base.
		 */
		if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
			*ret = s.base + off;
		else
			*ret = off;

		*ret = vmx_get_untagged_addr(vcpu, *ret, 0);
		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
		 * non-canonical form. This is the only check on the memory
		 * destination for long mode!
		 */
		exn = is_noncanonical_address(*ret, vcpu, 0);
	} else {
		/*
		 * When not in long mode, the virtual/linear address is
		 * unconditionally truncated to 32 bits regardless of the
		 * address size.
		 */
		*ret = (s.base + off) & 0xffffffff;

		/* Protected mode: apply checks for segment validity in the
		 * following order:
		 * - segment type check (#GP(0) may be thrown)
		 * - usability check (#GP(0)/#SS(0))
		 * - limit check (#GP(0)/#SS(0))
		 */
		if (wr)
			/* #GP(0) if the destination operand is located in a
			 * read-only data segment or any code segment.
			 */
			exn = ((s.type & 0xa) == 0 || (s.type & 8));
		else
			/* #GP(0) if the source operand is located in an
			 * execute-only code segment
			 */
			exn = ((s.type & 0xa) == 8);
		if (exn) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
		 */
		exn = (s.unusable != 0);

		/*
		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
		 * outside the segment limit.  All CPUs that support VMX ignore
		 * limit checks for flat segments, i.e. segments with base==0,
		 * limit==0xffffffff and of type expand-up data or code.
		 */
		if (!(s.base == 0 && s.limit == 0xffffffff &&
		      ((s.type & 8) || !(s.type & 4))))
			exn = exn || ((u64)off + len - 1 > s.limit);
	}
	if (exn) {
		kvm_queue_exception_e(vcpu,
				      seg_reg == VCPU_SREG_SS ?
						SS_VECTOR : GP_VECTOR,
				      0);
		return 1;
	}

	return 0;
}

/*
 * Read the VMCS-pointer memory operand of the current VMX instruction from
 * guest memory.  Returns 0 on success; on failure, returns -EINVAL with
 * *ret set to the value the exit handler should return to its caller
 * (an exception has already been queued or a memory failure handled).
 */
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
				int *ret)
{
	gva_t gva;
	struct x86_exception e;
	int r;

	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
				vmcs_read32(VMX_INSTRUCTION_INFO), false,
				sizeof(*vmpointer), &gva)) {
		*ret = 1;
		return -EINVAL;
	}

	r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
	if (r != X86EMUL_CONTINUE) {
		*ret = kvm_handle_memory_failure(vcpu, r, &e);
		return -EINVAL;
	}

	return 0;
}

/*
 * Allocate a shadow VMCS and associate it with the currently loaded
 * VMCS, unless such a shadow VMCS already exists. The newly allocated
 * VMCS is also VMCLEARed, so that it is ready for use.
 */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;

	/*
	 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
	 * when L1 executes VMXOFF or the vCPU is forced out of nested
	 * operation.  VMXON faults if the CPU is already post-VMXON, so it
	 * should be impossible to already have an allocated shadow VMCS.  KVM
	 * doesn't support virtualization of VMCS shadowing, so vmcs01 should
	 * always be the loaded VMCS.
	 */
	if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
		return loaded_vmcs->shadow_vmcs;

	loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
	if (loaded_vmcs->shadow_vmcs)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	return loaded_vmcs->shadow_vmcs;
}

/*
 * Allocate and initialize all per-vCPU state needed for nested VMX
 * operation (vmcs02, the vmcs12 caches, the shadow VMCS, the preemption
 * timer and vpid02).  Returns 0 on success, -ENOMEM on allocation failure,
 * unwinding any partial allocations via the out_* labels.
 */
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
	if (r < 0)
		goto out_vmcs02;

	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_vmcs12)
		goto out_cached_vmcs12;

	vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_shadow_vmcs12)
		goto out_cached_shadow_vmcs12;

	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
		goto out_shadow_vmcs;

	hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
		      HRTIMER_MODE_ABS_PINNED);

	vmx->nested.vpid02 = allocate_vpid();

	vmx->nested.vmcs02_initialized = false;
	vmx->nested.vmxon = true;

	if (vmx_pt_mode_is_host_guest()) {
		vmx->pt_desc.guest.ctl = 0;
		pt_update_intercept_for_msr(vcpu);
	}

	return 0;

out_shadow_vmcs:
	kfree(vmx->nested.cached_shadow_vmcs12);

out_cached_shadow_vmcs12:
	kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
	free_loaded_vmcs(&vmx->nested.vmcs02);

out_vmcs02:
	return -ENOMEM;
}

/* Emulate the VMXON instruction. */
static int handle_vmxon(struct kvm_vcpu *vcpu)
{
	int ret;
	gpa_t vmptr;
	uint32_t revision;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
		| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;

	/*
	 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
	 * the guest and so cannot rely on hardware to perform the check,
	 * which has higher priority than VM-Exit (see Intel SDM's pseudocode
	 * for VMXON).
	 *
	 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
	 * and !COMPATIBILITY modes.  For an unrestricted guest, KVM doesn't
	 * force any of the relevant guest state.  For a restricted guest, KVM
	 * does force CR0.PE=1, but only to also force VM86 in order to emulate
	 * Real Mode, and so there's no need to check CR0.PE manually.
	 */
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/*
	 * The CPL is checked for "not in VMX operation" and for "in VMX root",
	 * and has higher priority than the VM-Fail due to being post-VMXON,
	 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0.  In VMX non-root,
	 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
	 * from L2 to L1, i.e. there's no need to check for the vCPU being in
	 * VMX non-root.
	 *
	 * Forwarding the VM-Exit unconditionally, i.e. without performing the
	 * #UD checks (see above), is functionally ok because KVM doesn't allow
	 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's
	 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
	 * missed by hardware due to shadowing CR0 and/or CR4.
	 */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (vmx->nested.vmxon)
		return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);

	/*
	 * Invalid CR0/CR4 generates #GP.  These checks are performed if and
	 * only if the vCPU isn't already in VMX operation, i.e. effectively
	 * have lower priority than the VM-Fail above.
	 */
	if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
	    !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
			!= VMXON_NEEDED_FEATURES) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
		return ret;

	/*
	 * SDM 3: 24.11.5
	 * The first 4 bytes of VMXON region contain the supported
	 * VMCS revision identifier
	 *
	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
	 * which replaces physical address width with 32
	 */
	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_failInvalid(vcpu);

	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
	    revision != VMCS12_REVISION)
		return nested_vmx_failInvalid(vcpu);

	vmx->nested.vmxon_ptr = vmptr;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	return nested_vmx_succeed(vcpu);
}

static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->nested.current_vmptr == INVALID_GPA)
		return;

	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));

	if (enable_shadow_vmcs) {
		/* copy to memory all shadowed fields in case
		   they were modified */
		copy_shadow_to_vmcs12(vmx);
		vmx_disable_shadow_vmcs(vmx);
	}
	vmx->nested.posted_intr_nv = -1;

	/* Flush VMCS12 to guest memory */
/*
 * Emulate the VMCLEAR instruction.
 *
 * Validates the operand pointer (page-aligned, within the guest's physical
 * address width, and not the VMXON region), releases the cached vmcs12 if it
 * is the target, and clears the launch_state field in guest memory.
 * Returns 1 with guest-visible success/failure flags set, or propagates the
 * return code from operand decoding.
 */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 zero = 0;
	gpa_t vmptr;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
		return r;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);

	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);

	/* An enlightened VMCS VMCLEAR is handled entirely by the eVMCS code. */
	if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
		if (vmptr == vmx->nested.current_vmptr)
			nested_release_vmcs12(vcpu);

		/*
		 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
		 * for VMCLEAR includes a "ensure that data for VMCS referenced
		 * by the operand is in memory" clause that guards writes to
		 * memory, i.e. doing nothing for I/O is architecturally valid.
		 *
		 * FIXME: Suppress failures if and only if no memslot is found,
		 * i.e. exit to userspace if __copy_to_user() fails.
		 */
		(void)kvm_vcpu_write_guest(vcpu,
					   vmptr + offsetof(struct vmcs12,
							    launch_state),
					   &zero, sizeof(zero));
	}

	return nested_vmx_succeed(vcpu);
}
/*
 * Emulate the VMREAD instruction.
 *
 * Reads the requested field from the cached vmcs12 (or the shadow vmcs12
 * when executed from L2, or the enlightened VMCS as a Windows workaround)
 * and stores the zero-extended value into the destination register or
 * guest memory operand.
 */
static int handle_vmread(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	u64 value;
	gva_t gva = 0;
	short offset;
	int len, r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* Decode instruction info and find the field to read */
	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));

	if (!nested_vmx_is_evmptr12_valid(vmx)) {
		/*
		 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
		 * any VMREAD sets the ALU flags for VMfailInvalid.
		 */
		if (vmx->nested.current_vmptr == INVALID_GPA ||
		    (is_guest_mode(vcpu) &&
		     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
			return nested_vmx_failInvalid(vcpu);

		offset = get_vmcs12_field_offset(field);
		if (offset < 0)
			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

		/* "Rare" fields may live only in vmcs02; sync before reading. */
		if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
			copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

		/* Read the field, zero-extended to a u64 value */
		value = vmcs12_read_any(vmcs12, field, offset);
	} else {
		/*
		 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
		 * enlightened VMCS is active VMREAD/VMWRITE instructions are
		 * unsupported. Unfortunately, certain versions of Windows 11
		 * don't comply with this requirement which is not enforced in
		 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
		 * workaround, as misbehaving guests will panic on VM-Fail.
		 * Note, enlightened VMCS is incompatible with shadow VMCS so
		 * all VMREADs from L2 should go to L1.
		 */
		if (WARN_ON_ONCE(is_guest_mode(vcpu)))
			return nested_vmx_failInvalid(vcpu);

		offset = evmcs_field_offset(field, NULL);
		if (offset < 0)
			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

		/* Read the field, zero-extended to a u64 value */
		value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
	}

	/*
	 * Now copy part of this value to register or memory, as requested.
	 * Note that the number of bits actually copied is 32 or 64 depending
	 * on the guest's mode (32 or 64 bit), not on the given field's length.
	 */
	if (instr_info & BIT(10)) {
		kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
	} else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, true, len, &gva))
			return 1;
		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
		r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	return nested_vmx_succeed(vcpu);
}
/*
 * Emulate the VMWRITE instruction.
 *
 * Reads the source operand (register or memory), validates the target field,
 * and writes the value into the cached vmcs12 (or shadow vmcs12 from L2).
 * When shadow VMCS is in use, read-only shadowed fields are also propagated
 * into vmcs01's shadow VMCS so L1 can VMREAD them without exiting.
 */
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	short offset;
	gva_t gva;
	int len, r;

	/*
	 * The value to write might be 32 or 64 bits, depending on L1's long
	 * mode, and eventually we need to write that into a field of several
	 * possible lengths. The code below first zero-extends the value to 64
	 * bit (value), and then copies only the appropriate number of
	 * bits into the vmcs12 field.
	 */
	u64 value = 0;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/*
	 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
	 * any VMWRITE sets the ALU flags for VMfailInvalid.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA ||
	    (is_guest_mode(vcpu) &&
	     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	/* Bit 10 of instruction info selects register vs. memory operand. */
	if (instr_info & BIT(10))
		value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
	else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, false, len, &gva))
			return 1;
		r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));

	offset = get_vmcs12_field_offset(field);
	if (offset < 0)
		return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

	/*
	 * If the vCPU supports "VMWRITE to any supported field in the
	 * VMCS," then the "read-only" fields are actually read/write.
	 */
	if (vmcs_field_readonly(field) &&
	    !nested_cpu_has_vmwrite_any_field(vcpu))
		return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);

	/*
	 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
	 * vmcs12, else we may crush a field or consume a stale value.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	/*
	 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
	 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
	 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
	 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
	 * from L1 will return a different value than VMREAD from L2 (L1 sees
	 * the stripped down value, L2 sees the full value as stored by KVM).
	 */
	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
		value &= 0x1f0ff;

	vmcs12_write_any(vmcs12, field, offset, value);

	/*
	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
	 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
		/*
		 * L1 can read these fields without exiting, ensure the
		 * shadow VMCS is up-to-date.
		 */
		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
			/*
			 * Temporarily switch the current VMCS to vmcs01's
			 * shadow VMCS; preemption must stay off so the
			 * loaded-VMCS state can't be migrated mid-sequence.
			 */
			preempt_disable();
			vmcs_load(vmx->vmcs01.shadow_vmcs);

			__vmcs_writel(field, value);

			vmcs_clear(vmx->vmcs01.shadow_vmcs);
			vmcs_load(vmx->loaded_vmcs->vmcs);
			preempt_enable();
		}
		vmx->nested.dirty_vmcs12 = true;
	}

	return nested_vmx_succeed(vcpu);
}
/*
 * Make @vmptr the current vmcs12 pointer and arm shadow-VMCS syncing (when
 * enabled) so the new vmcs12's contents reach the shadow VMCS.  Marks vmcs12
 * dirty and forces an MSR-bitmap recalculation for the next nested VM-Enter.
 */
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}
	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
}
/*
 * Emulate the VMPTRLD instruction.
 *
 * Validates the operand pointer, verifies the in-memory VMCS header
 * (revision ID, shadow-VMCS capability), releases any previously-loaded
 * vmcs12, caches the new vmcs12 from guest memory and makes it current.
 */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t vmptr;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
		return r;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);

	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);

	/* Forbid normal VMPTRLD if Enlightened version was used */
	if (nested_vmx_is_evmptr12_valid(vmx))
		return 1;

	if (vmx->nested.current_vmptr != vmptr) {
		struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
		struct vmcs_hdr hdr;

		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
			/*
			 * Reads from an unbacked page return all 1s,
			 * which means that the 32 bits located at the
			 * given physical address won't match the required
			 * VMCS12_REVISION identifier.
			 */
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
						 offsetof(struct vmcs12, hdr),
						 sizeof(hdr))) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		/* A shadow VMCS is only legal if L1 was given the capability. */
		if (hdr.revision_id != VMCS12_REVISION ||
		    (hdr.shadow_vmcs &&
		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		nested_release_vmcs12(vcpu);

		/*
		 * Load VMCS12 from guest memory since it is not already
		 * cached.
		 */
		if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
					  VMCS12_SIZE)) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		set_current_vmptr(vmx, vmptr);
	}

	return nested_vmx_succeed(vcpu);
}
5940 */ 5941 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5942 VMCS12_SIZE)) { 5943 return nested_vmx_fail(vcpu, 5944 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5945 } 5946 5947 set_current_vmptr(vmx, vmptr); 5948 } 5949 5950 return nested_vmx_succeed(vcpu); 5951 } 5952 5953 /* Emulate the VMPTRST instruction */ 5954 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5955 { 5956 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5957 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5958 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5959 struct x86_exception e; 5960 gva_t gva; 5961 int r; 5962 5963 if (!nested_vmx_check_permission(vcpu)) 5964 return 1; 5965 5966 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5967 return 1; 5968 5969 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5970 true, sizeof(gpa_t), &gva)) 5971 return 1; 5972 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5973 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5974 sizeof(gpa_t), &e); 5975 if (r != X86EMUL_CONTINUE) 5976 return kvm_handle_memory_failure(vcpu, r, &e); 5977 5978 return nested_vmx_succeed(vcpu); 5979 } 5980 5981 /* Emulate the INVEPT instruction */ 5982 static int handle_invept(struct kvm_vcpu *vcpu) 5983 { 5984 struct vcpu_vmx *vmx = to_vmx(vcpu); 5985 u32 vmx_instruction_info, types; 5986 unsigned long type, roots_to_free; 5987 struct kvm_mmu *mmu; 5988 gva_t gva; 5989 struct x86_exception e; 5990 struct { 5991 u64 eptp, gpa; 5992 } operand; 5993 int i, r, gpr_index; 5994 5995 if (!(vmx->nested.msrs.secondary_ctls_high & 5996 SECONDARY_EXEC_ENABLE_EPT) || 5997 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5998 kvm_queue_exception(vcpu, UD_VECTOR); 5999 return 1; 6000 } 6001 6002 if (!nested_vmx_check_permission(vcpu)) 6003 return 1; 6004 6005 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6006 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6007 type = 
/*
 * Emulate the INVEPT instruction.
 *
 * #UDs if nested EPT/INVEPT isn't exposed to L1, validates the invalidation
 * type against the advertised capabilities, then frees the matching nested
 * EPT roots (guest_mmu) — a context-wide invalidation frees only roots whose
 * EPTP matches the operand, a global invalidation frees them all.
 */
static int handle_invept(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info, types;
	unsigned long type, roots_to_free;
	struct kvm_mmu *mmu;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 eptp, gpa;
	} operand;
	int i, r, gpr_index;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_EPT) ||
	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	/* Bits 1:0 of the supported-types mask (single-context, global). */
	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	/*
	 * Nested EPT roots are always held through guest_mmu,
	 * not root_mmu.
	 */
	mmu = &vcpu->arch.guest_mmu;

	switch (type) {
	case VMX_EPT_EXTENT_CONTEXT:
		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

		roots_to_free = 0;
		if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
					    operand.eptp))
			roots_to_free |= KVM_MMU_ROOT_CURRENT;

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
						    mmu->prev_roots[i].pgd,
						    operand.eptp))
				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
		}
		break;
	case VMX_EPT_EXTENT_GLOBAL:
		roots_to_free = KVM_MMU_ROOTS_ALL;
		break;
	default:
		/* Unreachable: type was validated against 'types' above. */
		BUG();
		break;
	}

	if (roots_to_free)
		kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);

	return nested_vmx_succeed(vcpu);
}
/*
 * Emulate the INVVPID instruction.
 *
 * #UDs if nested VPID/INVVPID isn't exposed to L1, validates the type and
 * the (vpid, gla) descriptor read from guest memory, then flushes the
 * effective vpid02 TLB tag.  With EPT disabled, also frees guest-mode
 * roots of root_mmu since L1 and L2 share an MMU in that configuration.
 */
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;
	u16 vpid02;
	int r, gpr_index;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
	    !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	types = (vmx->nested.msrs.vpid_caps &
		 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* according to the intel vmx instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	/* VPIDs are 16 bits; a descriptor with high bits set is invalid. */
	if (operand.vpid >> 16)
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/*
	 * Always flush the effective vpid02, i.e. never flush the current VPID
	 * and never explicitly flush vpid01.  INVVPID targets a VPID, not a
	 * VMCS, and so whether or not the current vmcs12 has VPID enabled is
	 * irrelevant (and there may not be a loaded vmcs12).
	 */
	vpid02 = nested_get_vpid02(vcpu);
	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		/*
		 * LAM doesn't apply to addresses that are inputs to TLB
		 * invalidation.
		 */
		if (!operand.vpid ||
		    is_noncanonical_invlpg_address(operand.gla, vcpu))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_vcpu_addr(vpid02, operand.gla);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_context(vpid02);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		vpid_sync_context(vpid02);
		break;
	default:
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	/*
	 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
	 * linear mappings for L2 (tagged with L2's VPID).  Free all guest
	 * roots as VPIDs are not tracked in the MMU role.
	 *
	 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
	 * an MMU when EPT is disabled.
	 *
	 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
	 */
	if (!enable_ept)
		kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);

	return nested_vmx_succeed(vcpu);
}
/*
 * Handle the EPTP-switching VM function (function 0) on behalf of L2.
 *
 * Reads the new EPTP from the vmcs12 EPTP-list page at index RCX, validates
 * it, and installs it as the nested EPT pointer.  Returns 0 on success, 1 on
 * any failure (invalid index, bad EPTP, unreadable list), in which case the
 * caller reflects the VMFUNC exit to L1.
 */
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = kvm_rcx_read(vcpu);
	u64 new_eptp;

	if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
		return 1;
	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;

	/* Each EPTP-list entry is 8 bytes. */
	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &new_eptp, index * 8, 8))
		return 1;

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != new_eptp) {
		if (!nested_vmx_check_eptp(vcpu, new_eptp))
			return 1;

		vmcs12->ept_pointer = new_eptp;
		nested_ept_new_eptp(vcpu);

		/* Without VPID, guest-linear translations must be flushed. */
		if (!nested_cpu_has_vpid(vmcs12))
			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
	}

	return 0;
}
6214 */ 6215 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6216 kvm_queue_exception(vcpu, UD_VECTOR); 6217 return 1; 6218 } 6219 6220 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6221 goto fail; 6222 6223 switch (function) { 6224 case 0: 6225 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6226 goto fail; 6227 break; 6228 default: 6229 goto fail; 6230 } 6231 return kvm_skip_emulated_instruction(vcpu); 6232 6233 fail: 6234 /* 6235 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6236 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6237 * EXIT_REASON_VMFUNC as the exit reason. 6238 */ 6239 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6240 vmx_get_intr_info(vcpu), 6241 vmx_get_exit_qual(vcpu)); 6242 return 1; 6243 } 6244 6245 /* 6246 * Return true if an IO instruction with the specified port and size should cause 6247 * a VM-exit into L1. 6248 */ 6249 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6250 int size) 6251 { 6252 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6253 gpa_t bitmap, last_bitmap; 6254 u8 b; 6255 6256 last_bitmap = INVALID_GPA; 6257 b = -1; 6258 6259 while (size > 0) { 6260 if (port < 0x8000) 6261 bitmap = vmcs12->io_bitmap_a; 6262 else if (port < 0x10000) 6263 bitmap = vmcs12->io_bitmap_b; 6264 else 6265 return true; 6266 bitmap += (port & 0x7fff) / 8; 6267 6268 if (last_bitmap != bitmap) 6269 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6270 return true; 6271 if (b & (1 << (port & 7))) 6272 return true; 6273 6274 port++; 6275 size--; 6276 last_bitmap = bitmap; 6277 } 6278 6279 return false; 6280 } 6281 6282 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6283 struct vmcs12 *vmcs12) 6284 { 6285 unsigned long exit_qualification; 6286 unsigned short port; 6287 int size; 6288 6289 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6290 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6291 6292 exit_qualification = 
vmx_get_exit_qual(vcpu); 6293 6294 port = exit_qualification >> 16; 6295 size = (exit_qualification & 7) + 1; 6296 6297 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6298 } 6299 6300 /* 6301 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6302 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6303 * disinterest in the current event (read or write a specific MSR) by using an 6304 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6305 */ 6306 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6307 struct vmcs12 *vmcs12, 6308 union vmx_exit_reason exit_reason) 6309 { 6310 u32 msr_index; 6311 gpa_t bitmap; 6312 6313 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6314 return true; 6315 6316 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6317 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6318 msr_index = vmx_get_exit_qual(vcpu); 6319 else 6320 msr_index = kvm_rcx_read(vcpu); 6321 6322 /* 6323 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6324 * for the four combinations of read/write and low/high MSR numbers. 6325 * First we need to figure out which of the four to use: 6326 */ 6327 bitmap = vmcs12->msr_bitmap; 6328 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6329 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6330 bitmap += 2048; 6331 if (msr_index >= 0xc0000000) { 6332 msr_index -= 0xc0000000; 6333 bitmap += 1024; 6334 } 6335 6336 /* Then read the msr_index'th bit from this bitmap: */ 6337 if (msr_index < 1024*8) { 6338 unsigned char b; 6339 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6340 return true; 6341 return 1 & (b >> (msr_index & 7)); 6342 } else 6343 return true; /* let L1 handle the wrong parameter */ 6344 } 6345 6346 /* 6347 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6348 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6349 * intercept (via guest_host_mask etc.) 
/*
 * Return true if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0.  I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	/* Bits 5:4 of the exit qualification encode the access type. */
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_read(vcpu, reg);
		switch (cr) {
		case 0:
			/* Exit if a bit L1 owns (guest/host mask) changes. */
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}
/*
 * Return true if a VMREAD/VMWRITE from L2 should exit to L1, per the
 * VMREAD/VMWRITE bitmap at @bitmap in vmcs12.  Without shadow VMCS, all
 * VMCS accesses unconditionally exit to L1.
 */
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12, gpa_t bitmap)
{
	u32 vmx_instruction_info;
	unsigned long field;
	u8 b;

	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return true;

	/* Decode instruction info and find the field to access */
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));

	/* Out-of-range fields always cause a VM exit from L2 to L1 */
	if (field >> 15)
		return true;

	/* Failure to read the bitmap also defers to L1. */
	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
		return true;

	return 1 & (b >> (field & 7));
}
/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
 * L1 wants the exit.  Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			/* L0 intercepts #PF for async page faults or when it
			 * must intercept page faults for other reasons. */
			return vcpu->arch.apf.host_apf_flags ||
			       vmx_need_pf_intercept(vcpu);
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return true;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return true;
		else if (is_alignment_check(intr_info) &&
			 !vmx_guest_inject_ac(vcpu))
			return true;
		else if (is_ve_fault(intr_info))
			return true;
		return false;
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return true;
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return true;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses directly L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table is L0's fault.
		 */
		return true;
	case EXIT_REASON_PREEMPTION_TIMER:
		return true;
	case EXIT_REASON_PML_FULL:
		/*
		 * PML is emulated for an L1 VMM and should never be enabled in
		 * vmcs02, always "handle" PML_FULL by exiting to userspace.
		 */
		return true;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return true;
	case EXIT_REASON_BUS_LOCK:
		/*
		 * At present, bus lock VM exit is never exposed to L1.
		 * Handle L2's bus locks in L0 directly.
		 */
		return true;
#ifdef CONFIG_KVM_HYPERV
	case EXIT_REASON_VMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
			kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
	default:
		break;
	}
	return false;
}
/*
 * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
 * is_guest_mode (L2).  The decision is made from vmcs12's execution controls,
 * exception bitmap, and per-resource bitmaps as configured by L1.
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ_IMM:
	case EXIT_REASON_MSR_WRITE_IMM:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES:
	case EXIT_REASON_XRSTORS:
		/*
		 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
		 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap
		 * verbatim, i.e. any exit is due to L1's bitmap.  WARN if
		 * XSAVES isn't enabled, as the CPU is supposed to inject #UD
		 * in that case, before consulting the XSS-bitmap.
		 */
		WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
		return true;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	case EXIT_REASON_SEAMCALL:
	case EXIT_REASON_TDCALL:
		/*
		 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
		 * virtualized by KVM for L1 hypervisors, i.e. L1 should
		 * never want or expect such an exit.
		 */
		return false;
	default:
		return true;
	}
}
6713 */ 6714 if (unlikely(vmx->fail)) { 6715 trace_kvm_nested_vmenter_failed( 6716 "hardware VM-instruction error: ", 6717 vmcs_read32(VM_INSTRUCTION_ERROR)); 6718 exit_intr_info = 0; 6719 exit_qual = 0; 6720 goto reflect_vmexit; 6721 } 6722 6723 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6724 6725 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6726 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6727 return false; 6728 6729 /* If L1 doesn't want the exit, handle it in L0. */ 6730 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6731 return false; 6732 6733 /* 6734 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6735 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6736 * need to be synthesized by querying the in-kernel LAPIC, but external 6737 * interrupts are never reflected to L1 so it's a non-issue. 6738 */ 6739 exit_intr_info = vmx_get_intr_info(vcpu); 6740 if (is_exception_with_error_code(exit_intr_info)) { 6741 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6742 6743 vmcs12->vm_exit_intr_error_code = 6744 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6745 } 6746 exit_qual = vmx_get_exit_qual(vcpu); 6747 6748 reflect_vmexit: 6749 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6750 return true; 6751 } 6752 6753 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6754 struct kvm_nested_state __user *user_kvm_nested_state, 6755 u32 user_data_size) 6756 { 6757 struct vcpu_vmx *vmx; 6758 struct vmcs12 *vmcs12; 6759 struct kvm_nested_state kvm_state = { 6760 .flags = 0, 6761 .format = KVM_STATE_NESTED_FORMAT_VMX, 6762 .size = sizeof(kvm_state), 6763 .hdr.vmx.flags = 0, 6764 .hdr.vmx.vmxon_pa = INVALID_GPA, 6765 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6766 .hdr.vmx.preemption_timer_deadline = 0, 6767 }; 6768 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6769 &user_kvm_nested_state->data.vmx[0]; 6770 6771 if (!vcpu) 6772 return kvm_state.size + sizeof(*user_vmx_nested_state); 6773 6774 vmx 
= to_vmx(vcpu); 6775 vmcs12 = get_vmcs12(vcpu); 6776 6777 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6778 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6779 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6780 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6781 6782 if (vmx_has_valid_vmcs12(vcpu)) { 6783 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6784 6785 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6786 if (nested_vmx_is_evmptr12_set(vmx)) 6787 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6788 6789 if (is_guest_mode(vcpu) && 6790 nested_cpu_has_shadow_vmcs(vmcs12) && 6791 vmcs12->vmcs_link_pointer != INVALID_GPA) 6792 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6793 } 6794 6795 if (vmx->nested.smm.vmxon) 6796 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6797 6798 if (vmx->nested.smm.guest_mode) 6799 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6800 6801 if (is_guest_mode(vcpu)) { 6802 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6803 6804 if (vmx->nested.nested_run_pending) 6805 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6806 6807 if (vmx->nested.mtf_pending) 6808 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6809 6810 if (nested_cpu_has_preemption_timer(vmcs12) && 6811 vmx->nested.has_preemption_timer_deadline) { 6812 kvm_state.hdr.vmx.flags |= 6813 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6814 kvm_state.hdr.vmx.preemption_timer_deadline = 6815 vmx->nested.preemption_timer_deadline; 6816 } 6817 } 6818 } 6819 6820 if (user_data_size < kvm_state.size) 6821 goto out; 6822 6823 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6824 return -EFAULT; 6825 6826 if (!vmx_has_valid_vmcs12(vcpu)) 6827 goto out; 6828 6829 /* 6830 * When running L2, the authoritative vmcs12 state is in the 6831 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6832 * in the shadow or enlightened vmcs linked to vmcs01, unless 6833 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6834 * vmcs12 state is in the vmcs12 already. 6835 */ 6836 if (is_guest_mode(vcpu)) { 6837 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6838 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6839 } else { 6840 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6841 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6842 if (nested_vmx_is_evmptr12_valid(vmx)) 6843 /* 6844 * L1 hypervisor is not obliged to keep eVMCS 6845 * clean fields data always up-to-date while 6846 * not in guest mode, 'hv_clean_fields' is only 6847 * supposed to be actual upon vmentry so we need 6848 * to ignore it here and do full copy. 6849 */ 6850 copy_enlightened_to_vmcs12(vmx, 0); 6851 else if (enable_shadow_vmcs) 6852 copy_shadow_to_vmcs12(vmx); 6853 } 6854 } 6855 6856 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6857 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6858 6859 /* 6860 * Copy over the full allocated size of vmcs12 rather than just the size 6861 * of the struct. 
6862 */ 6863 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6864 return -EFAULT; 6865 6866 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6867 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6868 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6869 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6870 return -EFAULT; 6871 } 6872 out: 6873 return kvm_state.size; 6874 } 6875 6876 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6877 { 6878 if (is_guest_mode(vcpu)) { 6879 to_vmx(vcpu)->nested.nested_run_pending = 0; 6880 nested_vmx_vmexit(vcpu, -1, 0, 0); 6881 } 6882 free_nested(vcpu); 6883 } 6884 6885 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6886 struct kvm_nested_state __user *user_kvm_nested_state, 6887 struct kvm_nested_state *kvm_state) 6888 { 6889 struct vcpu_vmx *vmx = to_vmx(vcpu); 6890 struct vmcs12 *vmcs12; 6891 enum vm_entry_failure_code ignored; 6892 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6893 &user_kvm_nested_state->data.vmx[0]; 6894 int ret; 6895 6896 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6897 return -EINVAL; 6898 6899 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6900 if (kvm_state->hdr.vmx.smm.flags) 6901 return -EINVAL; 6902 6903 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6904 return -EINVAL; 6905 6906 /* 6907 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6908 * enable eVMCS capability on vCPU. However, since then 6909 * code was changed such that flag signals vmcs12 should 6910 * be copied into eVMCS in guest memory. 6911 * 6912 * To preserve backwards compatibility, allow user 6913 * to set this flag even when there is no VMXON region. 
6914 */ 6915 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6916 return -EINVAL; 6917 } else { 6918 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 6919 return -EINVAL; 6920 6921 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6922 return -EINVAL; 6923 } 6924 6925 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6926 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6927 return -EINVAL; 6928 6929 if (kvm_state->hdr.vmx.smm.flags & 6930 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6931 return -EINVAL; 6932 6933 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6934 return -EINVAL; 6935 6936 /* 6937 * SMM temporarily disables VMX, so we cannot be in guest mode, 6938 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6939 * must be zero. 6940 */ 6941 if (is_smm(vcpu) ? 6942 (kvm_state->flags & 6943 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6944 : kvm_state->hdr.vmx.smm.flags) 6945 return -EINVAL; 6946 6947 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6948 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6949 return -EINVAL; 6950 6951 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6952 (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || 6953 !vmx->nested.enlightened_vmcs_enabled)) 6954 return -EINVAL; 6955 6956 vmx_leave_nested(vcpu); 6957 6958 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6959 return 0; 6960 6961 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6962 ret = enter_vmx_operation(vcpu); 6963 if (ret) 6964 return ret; 6965 6966 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6967 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6968 /* See vmx_has_valid_vmcs12. 
*/ 6969 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6970 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6971 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6972 return -EINVAL; 6973 else 6974 return 0; 6975 } 6976 6977 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6978 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6979 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6980 return -EINVAL; 6981 6982 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6983 #ifdef CONFIG_KVM_HYPERV 6984 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6985 /* 6986 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6987 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6988 * restored yet. EVMCS will be mapped from 6989 * nested_get_vmcs12_pages(). 6990 */ 6991 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6992 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6993 #endif 6994 } else { 6995 return -EINVAL; 6996 } 6997 6998 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6999 vmx->nested.smm.vmxon = true; 7000 vmx->nested.vmxon = false; 7001 7002 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 7003 vmx->nested.smm.guest_mode = true; 7004 } 7005 7006 vmcs12 = get_vmcs12(vcpu); 7007 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 7008 return -EFAULT; 7009 7010 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 7011 return -EINVAL; 7012 7013 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7014 return 0; 7015 7016 vmx->nested.nested_run_pending = 7017 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 7018 7019 vmx->nested.mtf_pending = 7020 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 7021 7022 ret = -EINVAL; 7023 if (nested_cpu_has_shadow_vmcs(vmcs12) && 7024 vmcs12->vmcs_link_pointer != INVALID_GPA) { 7025 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 7026 7027 if (kvm_state->size < 7028 sizeof(*kvm_state) + 7029 
sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 7030 goto error_guest_mode; 7031 7032 if (copy_from_user(shadow_vmcs12, 7033 user_vmx_nested_state->shadow_vmcs12, 7034 sizeof(*shadow_vmcs12))) { 7035 ret = -EFAULT; 7036 goto error_guest_mode; 7037 } 7038 7039 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 7040 !shadow_vmcs12->hdr.shadow_vmcs) 7041 goto error_guest_mode; 7042 } 7043 7044 vmx->nested.has_preemption_timer_deadline = false; 7045 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 7046 vmx->nested.has_preemption_timer_deadline = true; 7047 vmx->nested.preemption_timer_deadline = 7048 kvm_state->hdr.vmx.preemption_timer_deadline; 7049 } 7050 7051 if (nested_vmx_check_controls(vcpu, vmcs12) || 7052 nested_vmx_check_host_state(vcpu, vmcs12) || 7053 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 7054 goto error_guest_mode; 7055 7056 vmx->nested.dirty_vmcs12 = true; 7057 vmx->nested.force_msr_bitmap_recalc = true; 7058 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7059 if (ret) 7060 goto error_guest_mode; 7061 7062 if (vmx->nested.mtf_pending) 7063 kvm_make_request(KVM_REQ_EVENT, vcpu); 7064 7065 return 0; 7066 7067 error_guest_mode: 7068 vmx->nested.nested_run_pending = 0; 7069 return ret; 7070 } 7071 7072 void nested_vmx_set_vmcs_shadowing_bitmap(void) 7073 { 7074 if (enable_shadow_vmcs) { 7075 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 7076 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 7077 } 7078 } 7079 7080 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7081 { 7082 /* 7083 * Note these are the so called "index" of the VMCS field encoding, not 7084 * the index into vmcs12. 7085 */ 7086 unsigned int max_idx, idx; 7087 int i; 7088 7089 /* 7090 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 7091 * vmcs12, regardless of whether or not the associated feature is 7092 * exposed to L1. Simply find the field with the highest index. 
7093 */ 7094 max_idx = 0; 7095 for (i = 0; i < nr_vmcs12_fields; i++) { 7096 /* The vmcs12 table is very, very sparsely populated. */ 7097 if (!vmcs12_field_offsets[i]) 7098 continue; 7099 7100 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 7101 if (idx > max_idx) 7102 max_idx = idx; 7103 } 7104 7105 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 7106 } 7107 7108 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 7109 struct nested_vmx_msrs *msrs) 7110 { 7111 msrs->pinbased_ctls_low = 7112 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7113 7114 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 7115 msrs->pinbased_ctls_high &= 7116 PIN_BASED_EXT_INTR_MASK | 7117 PIN_BASED_NMI_EXITING | 7118 PIN_BASED_VIRTUAL_NMIS | 7119 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 7120 msrs->pinbased_ctls_high |= 7121 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7122 PIN_BASED_VMX_PREEMPTION_TIMER; 7123 } 7124 7125 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 7126 struct nested_vmx_msrs *msrs) 7127 { 7128 msrs->exit_ctls_low = 7129 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 7130 7131 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 7132 msrs->exit_ctls_high &= 7133 #ifdef CONFIG_X86_64 7134 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7135 #endif 7136 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7137 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7138 msrs->exit_ctls_high |= 7139 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7140 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7141 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7142 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7143 7144 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7145 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7146 msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7147 7148 /* We support free control of debug control saving. 
*/ 7149 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 7150 } 7151 7152 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 7153 struct nested_vmx_msrs *msrs) 7154 { 7155 msrs->entry_ctls_low = 7156 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 7157 7158 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 7159 msrs->entry_ctls_high &= 7160 #ifdef CONFIG_X86_64 7161 VM_ENTRY_IA32E_MODE | 7162 #endif 7163 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7164 VM_ENTRY_LOAD_CET_STATE; 7165 msrs->entry_ctls_high |= 7166 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7167 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7168 7169 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7170 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7171 msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7172 7173 /* We support free control of debug control loading. */ 7174 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 7175 } 7176 7177 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 7178 struct nested_vmx_msrs *msrs) 7179 { 7180 msrs->procbased_ctls_low = 7181 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7182 7183 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 7184 msrs->procbased_ctls_high &= 7185 CPU_BASED_INTR_WINDOW_EXITING | 7186 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 7187 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 7188 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 7189 CPU_BASED_CR3_STORE_EXITING | 7190 #ifdef CONFIG_X86_64 7191 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 7192 #endif 7193 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 7194 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 7195 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 7196 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 7197 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 7198 /* 7199 * We can allow some features even when not supported by the 7200 * hardware. 
For example, L1 can specify an MSR bitmap - and we 7201 * can use it to avoid exits to L1 - even when L0 runs L2 7202 * without MSR bitmaps. 7203 */ 7204 msrs->procbased_ctls_high |= 7205 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7206 CPU_BASED_USE_MSR_BITMAPS; 7207 7208 /* We support free control of CR3 access interception. */ 7209 msrs->procbased_ctls_low &= 7210 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 7211 } 7212 7213 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 7214 struct vmcs_config *vmcs_conf, 7215 struct nested_vmx_msrs *msrs) 7216 { 7217 msrs->secondary_ctls_low = 0; 7218 7219 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 7220 msrs->secondary_ctls_high &= 7221 SECONDARY_EXEC_DESC | 7222 SECONDARY_EXEC_ENABLE_RDTSCP | 7223 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7224 SECONDARY_EXEC_WBINVD_EXITING | 7225 SECONDARY_EXEC_APIC_REGISTER_VIRT | 7226 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 7227 SECONDARY_EXEC_RDRAND_EXITING | 7228 SECONDARY_EXEC_ENABLE_INVPCID | 7229 SECONDARY_EXEC_ENABLE_VMFUNC | 7230 SECONDARY_EXEC_RDSEED_EXITING | 7231 SECONDARY_EXEC_ENABLE_XSAVES | 7232 SECONDARY_EXEC_TSC_SCALING | 7233 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 7234 7235 /* 7236 * We can emulate "VMCS shadowing," even if the hardware 7237 * doesn't support it. 
7238 */ 7239 msrs->secondary_ctls_high |= 7240 SECONDARY_EXEC_SHADOW_VMCS; 7241 7242 if (enable_ept) { 7243 /* nested EPT: emulate EPT also to L1 */ 7244 msrs->secondary_ctls_high |= 7245 SECONDARY_EXEC_ENABLE_EPT; 7246 msrs->ept_caps = 7247 VMX_EPT_PAGE_WALK_4_BIT | 7248 VMX_EPT_PAGE_WALK_5_BIT | 7249 VMX_EPTP_WB_BIT | 7250 VMX_EPT_INVEPT_BIT | 7251 VMX_EPT_EXECUTE_ONLY_BIT; 7252 7253 msrs->ept_caps &= ept_caps; 7254 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 7255 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 7256 VMX_EPT_1GB_PAGE_BIT; 7257 if (enable_ept_ad_bits) { 7258 msrs->secondary_ctls_high |= 7259 SECONDARY_EXEC_ENABLE_PML; 7260 msrs->ept_caps |= VMX_EPT_AD_BIT; 7261 } 7262 7263 /* 7264 * Advertise EPTP switching irrespective of hardware support, 7265 * KVM emulates it in software so long as VMFUNC is supported. 7266 */ 7267 if (cpu_has_vmx_vmfunc()) 7268 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 7269 } 7270 7271 /* 7272 * Old versions of KVM use the single-context version without 7273 * checking for support, so declare that it is supported even 7274 * though it is treated as global context. The alternative is 7275 * not failing the single-context invvpid, and it is worse. 
7276 */ 7277 if (enable_vpid) { 7278 msrs->secondary_ctls_high |= 7279 SECONDARY_EXEC_ENABLE_VPID; 7280 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 7281 VMX_VPID_EXTENT_SUPPORTED_MASK; 7282 } 7283 7284 if (enable_unrestricted_guest) 7285 msrs->secondary_ctls_high |= 7286 SECONDARY_EXEC_UNRESTRICTED_GUEST; 7287 7288 if (flexpriority_enabled) 7289 msrs->secondary_ctls_high |= 7290 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7291 7292 if (enable_sgx) 7293 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7294 } 7295 7296 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7297 struct nested_vmx_msrs *msrs) 7298 { 7299 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7300 msrs->misc_low |= 7301 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7302 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7303 VMX_MISC_ACTIVITY_HLT | 7304 VMX_MISC_ACTIVITY_WAIT_SIPI; 7305 msrs->misc_high = 0; 7306 } 7307 7308 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7309 { 7310 /* 7311 * This MSR reports some information about VMX support. We 7312 * should return information about the VMX we emulate for the 7313 * guest, and the VMCS structure we give it - not about the 7314 * VMX support of the underlying hardware. 7315 */ 7316 msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7317 X86_MEMTYPE_WB); 7318 7319 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7320 if (cpu_has_vmx_basic_inout()) 7321 msrs->basic |= VMX_BASIC_INOUT; 7322 if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7323 msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7324 } 7325 7326 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7327 { 7328 /* 7329 * These MSRs specify bits which the guest must keep fixed on 7330 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7331 * We picked the standard core2 setting. 
7332 */ 7333 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7334 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7335 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7336 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7337 7338 /* These MSRs specify bits which the guest must keep fixed off. */ 7339 rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7340 rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7341 7342 if (vmx_umip_emulated()) 7343 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7344 } 7345 7346 /* 7347 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7348 * returned for the various VMX controls MSRs when nested VMX is enabled. 7349 * The same values should also be used to verify that vmcs12 control fields are 7350 * valid during nested entry from L1 to L2. 7351 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7352 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7353 * bit in the high half is on if the corresponding bit in the control field 7354 * may be on. See also vmx_control_verify(). 7355 */ 7356 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7357 { 7358 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7359 7360 /* 7361 * Note that as a general rule, the high half of the MSRs (bits in 7362 * the control fields which may be 1) should be initialized by the 7363 * intersection of the underlying hardware's MSR (i.e., features which 7364 * can be supported) and the list of features we want to expose - 7365 * because they are known to be properly supported in our code. 7366 * Also, usually, the low half of the MSRs (bits which must be 1) can 7367 * be set to 0, meaning that L1 may turn off any of these bits. 
The 7368 * reason is that if one of these bits is necessary, it will appear 7369 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7370 * fields of vmcs01 and vmcs02, will turn these bits off - and 7371 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7372 * These rules have exceptions below. 7373 */ 7374 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7375 7376 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7377 7378 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7379 7380 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7381 7382 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7383 7384 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7385 7386 nested_vmx_setup_basic(msrs); 7387 7388 nested_vmx_setup_cr_fixed(msrs); 7389 7390 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7391 } 7392 7393 void nested_vmx_hardware_unsetup(void) 7394 { 7395 int i; 7396 7397 if (enable_shadow_vmcs) { 7398 for (i = 0; i < VMX_BITMAP_NR; i++) 7399 free_page((unsigned long)vmx_bitmap[i]); 7400 } 7401 } 7402 7403 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7404 { 7405 int i; 7406 7407 /* 7408 * Note! The set of supported vmcs12 fields is consumed by both VMX 7409 * MSR and shadow VMCS setup. 7410 */ 7411 nested_vmx_setup_vmcs12_fields(); 7412 7413 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7414 7415 if (!cpu_has_vmx_shadow_vmcs()) 7416 enable_shadow_vmcs = 0; 7417 if (enable_shadow_vmcs) { 7418 for (i = 0; i < VMX_BITMAP_NR; i++) { 7419 /* 7420 * The vmx_bitmap is not tied to a VM and so should 7421 * not be charged to a memcg. 
7422 */ 7423 vmx_bitmap[i] = (unsigned long *) 7424 __get_free_page(GFP_KERNEL); 7425 if (!vmx_bitmap[i]) { 7426 nested_vmx_hardware_unsetup(); 7427 return -ENOMEM; 7428 } 7429 } 7430 7431 init_vmcs_shadow_fields(); 7432 } 7433 7434 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7435 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7436 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7437 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7438 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7439 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7440 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7441 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7442 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7443 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7444 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7445 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7446 7447 return 0; 7448 } 7449 7450 struct kvm_x86_nested_ops vmx_nested_ops = { 7451 .leave_nested = vmx_leave_nested, 7452 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7453 .check_events = vmx_check_nested_events, 7454 .has_events = vmx_has_nested_events, 7455 .triple_fault = nested_vmx_triple_fault, 7456 .get_state = vmx_get_nested_state, 7457 .set_state = vmx_set_nested_state, 7458 .get_nested_state_pages = vmx_get_nested_state_pages, 7459 .write_log_dirty = nested_vmx_write_pml_buffer, 7460 #ifdef CONFIG_KVM_HYPERV 7461 .enable_evmcs = nested_enable_evmcs, 7462 .get_evmcs_version = nested_get_evmcs_version, 7463 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7464 #endif 7465 }; 7466