1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 #include <asm/msr.h> 10 11 #include "x86.h" 12 #include "cpuid.h" 13 #include "hyperv.h" 14 #include "mmu.h" 15 #include "nested.h" 16 #include "pmu.h" 17 #include "posted_intr.h" 18 #include "sgx.h" 19 #include "trace.h" 20 #include "vmx.h" 21 #include "smm.h" 22 #include "x86_ops.h" 23 24 static bool __read_mostly enable_shadow_vmcs = 1; 25 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 26 27 static bool __ro_after_init warn_on_missed_cc; 28 module_param(warn_on_missed_cc, bool, 0444); 29 30 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 31 32 /* 33 * Hyper-V requires all of these, so mark them as supported even though 34 * they are just treated the same as all-context. 35 */ 36 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 37 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 39 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 40 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 41 42 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 43 44 enum { 45 VMX_VMREAD_BITMAP, 46 VMX_VMWRITE_BITMAP, 47 VMX_BITMAP_NR 48 }; 49 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 50 51 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 52 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 53 54 struct shadow_vmcs_field { 55 u16 encoding; 56 u16 offset; 57 }; 58 static struct shadow_vmcs_field shadow_read_only_fields[] = { 59 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 60 #include "vmcs_shadow_fields.h" 61 }; 62 static int max_shadow_read_only_fields = 63 ARRAY_SIZE(shadow_read_only_fields); 64 65 static struct shadow_vmcs_field shadow_read_write_fields[] = { 66 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 67 #include "vmcs_shadow_fields.h" 68 }; 69 static int 
max_shadow_read_write_fields = 70 ARRAY_SIZE(shadow_read_write_fields); 71 72 static void init_vmcs_shadow_fields(void) 73 { 74 int i, j; 75 76 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 77 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 78 79 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 80 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 81 u16 field = entry.encoding; 82 83 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 84 (i + 1 == max_shadow_read_only_fields || 85 shadow_read_only_fields[i + 1].encoding != field + 1)) 86 pr_err("Missing field from shadow_read_only_field %x\n", 87 field + 1); 88 89 if (get_vmcs12_field_offset(field) < 0) 90 continue; 91 92 clear_bit(field, vmx_vmread_bitmap); 93 if (field & 1) 94 #ifdef CONFIG_X86_64 95 continue; 96 #else 97 entry.offset += sizeof(u32); 98 #endif 99 shadow_read_only_fields[j++] = entry; 100 } 101 max_shadow_read_only_fields = j; 102 103 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 104 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 105 u16 field = entry.encoding; 106 107 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 108 (i + 1 == max_shadow_read_write_fields || 109 shadow_read_write_fields[i + 1].encoding != field + 1)) 110 pr_err("Missing field from shadow_read_write_field %x\n", 111 field + 1); 112 113 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 114 field <= GUEST_TR_AR_BYTES, 115 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 116 117 if (get_vmcs12_field_offset(field) < 0) 118 continue; 119 120 /* 121 * KVM emulates PML and the VMX preemption timer irrespective 122 * of hardware support, but shadowing their related VMCS fields 123 * requires hardware support as the CPU will reject VMWRITEs to 124 * fields that don't exist. 
125 */ 126 switch (field) { 127 case GUEST_PML_INDEX: 128 if (!cpu_has_vmx_pml()) 129 continue; 130 break; 131 case VMX_PREEMPTION_TIMER_VALUE: 132 if (!cpu_has_vmx_preemption_timer()) 133 continue; 134 break; 135 default: 136 break; 137 } 138 139 clear_bit(field, vmx_vmwrite_bitmap); 140 clear_bit(field, vmx_vmread_bitmap); 141 if (field & 1) 142 #ifdef CONFIG_X86_64 143 continue; 144 #else 145 entry.offset += sizeof(u32); 146 #endif 147 shadow_read_write_fields[j++] = entry; 148 } 149 max_shadow_read_write_fields = j; 150 } 151 152 /* 153 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 154 * set the success or error code of an emulated VMX instruction (as specified 155 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 156 * instruction. 157 */ 158 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 162 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 163 return kvm_skip_emulated_instruction(vcpu); 164 } 165 166 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 167 { 168 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 169 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 170 X86_EFLAGS_SF | X86_EFLAGS_OF)) 171 | X86_EFLAGS_CF); 172 return kvm_skip_emulated_instruction(vcpu); 173 } 174 175 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 176 u32 vm_instruction_error) 177 { 178 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 179 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 180 X86_EFLAGS_SF | X86_EFLAGS_OF)) 181 | X86_EFLAGS_ZF); 182 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 183 /* 184 * We don't need to force sync to shadow VMCS because 185 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 186 * fields and thus must be synced. 
187 */ 188 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 189 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 190 191 return kvm_skip_emulated_instruction(vcpu); 192 } 193 194 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 195 { 196 struct vcpu_vmx *vmx = to_vmx(vcpu); 197 198 /* 199 * failValid writes the error number to the current VMCS, which 200 * can't be done if there isn't a current VMCS. 201 */ 202 if (vmx->nested.current_vmptr == INVALID_GPA && 203 !nested_vmx_is_evmptr12_valid(vmx)) 204 return nested_vmx_failInvalid(vcpu); 205 206 return nested_vmx_failValid(vcpu, vm_instruction_error); 207 } 208 209 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 210 { 211 /* TODO: not to reset guest simply here. */ 212 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 213 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 214 } 215 216 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 217 { 218 return fixed_bits_valid(control, low, high); 219 } 220 221 static inline u64 vmx_control_msr(u32 low, u32 high) 222 { 223 return low | ((u64)high << 32); 224 } 225 226 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 227 { 228 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 229 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 230 vmx->nested.need_vmcs12_to_shadow_sync = false; 231 } 232 233 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 234 { 235 #ifdef CONFIG_KVM_HYPERV 236 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 237 struct vcpu_vmx *vmx = to_vmx(vcpu); 238 239 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); 240 vmx->nested.hv_evmcs = NULL; 241 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 242 243 if (hv_vcpu) { 244 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 245 hv_vcpu->nested.vm_id = 0; 246 hv_vcpu->nested.vp_id = 0; 247 } 248 #endif 249 } 250 251 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 252 { 253 #ifdef 
CONFIG_KVM_HYPERV 254 struct vcpu_vmx *vmx = to_vmx(vcpu); 255 /* 256 * When Enlightened VMEntry is enabled on the calling CPU we treat 257 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 258 * way to distinguish it from VMCS12) and we must not corrupt it by 259 * writing to the non-existent 'launch_state' field. The area doesn't 260 * have to be the currently active EVMCS on the calling CPU and there's 261 * nothing KVM has to do to transition it from 'active' to 'non-active' 262 * state. It is possible that the area will stay mapped as 263 * vmx->nested.hv_evmcs but this shouldn't be a problem. 264 */ 265 if (!guest_cpu_cap_has_evmcs(vcpu) || 266 !evmptr_is_valid(nested_get_evmptr(vcpu))) 267 return false; 268 269 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 270 nested_release_evmcs(vcpu); 271 272 return true; 273 #else 274 return false; 275 #endif 276 } 277 278 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 279 struct loaded_vmcs *prev) 280 { 281 struct vmcs_host_state *dest, *src; 282 283 if (unlikely(!vmx->vt.guest_state_loaded)) 284 return; 285 286 src = &prev->host_state; 287 dest = &vmx->loaded_vmcs->host_state; 288 289 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 290 dest->ldt_sel = src->ldt_sel; 291 #ifdef CONFIG_X86_64 292 dest->ds_sel = src->ds_sel; 293 dest->es_sel = src->es_sel; 294 #endif 295 } 296 297 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 298 { 299 struct vcpu_vmx *vmx = to_vmx(vcpu); 300 struct loaded_vmcs *prev; 301 int cpu; 302 303 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 304 return; 305 306 cpu = get_cpu(); 307 prev = vmx->loaded_vmcs; 308 vmx->loaded_vmcs = vmcs; 309 vmx_vcpu_load_vmcs(vcpu, cpu); 310 vmx_sync_vmcs_host_state(vmx, prev); 311 put_cpu(); 312 313 kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET); 314 315 /* 316 * All lazily updated registers will be reloaded from VMCS12 on both 317 * 
vmentry and vmexit. 318 */ 319 kvm_reset_dirty_registers(vcpu); 320 } 321 322 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) 323 { 324 struct vcpu_vmx *vmx = to_vmx(vcpu); 325 326 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); 327 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); 329 vmx->nested.pi_desc = NULL; 330 } 331 332 /* 333 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 334 * just stops using VMX. 335 */ 336 static void free_nested(struct kvm_vcpu *vcpu) 337 { 338 struct vcpu_vmx *vmx = to_vmx(vcpu); 339 340 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 341 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 342 343 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 344 return; 345 346 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 347 348 vmx->nested.vmxon = false; 349 vmx->nested.smm.vmxon = false; 350 vmx->nested.vmxon_ptr = INVALID_GPA; 351 free_vpid(vmx->nested.vpid02); 352 vmx->nested.posted_intr_nv = -1; 353 vmx->nested.current_vmptr = INVALID_GPA; 354 if (enable_shadow_vmcs) { 355 vmx_disable_shadow_vmcs(vmx); 356 vmcs_clear(vmx->vmcs01.shadow_vmcs); 357 free_vmcs(vmx->vmcs01.shadow_vmcs); 358 vmx->vmcs01.shadow_vmcs = NULL; 359 } 360 kfree(vmx->nested.cached_vmcs12); 361 vmx->nested.cached_vmcs12 = NULL; 362 kfree(vmx->nested.cached_shadow_vmcs12); 363 vmx->nested.cached_shadow_vmcs12 = NULL; 364 365 nested_put_vmcs12_pages(vcpu); 366 367 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 368 369 nested_release_evmcs(vcpu); 370 371 free_loaded_vmcs(&vmx->nested.vmcs02); 372 } 373 374 /* 375 * Ensure that the current vmcs of the logical processor is the 376 * vmcs01 of the vcpu before calling free_nested(). 
377 */ 378 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 379 { 380 vcpu_load(vcpu); 381 vmx_leave_nested(vcpu); 382 vcpu_put(vcpu); 383 } 384 385 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 386 387 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 388 { 389 return VALID_PAGE(root_hpa) && 390 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 391 } 392 393 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 394 gpa_t addr) 395 { 396 unsigned long roots = 0; 397 uint i; 398 struct kvm_mmu_root_info *cached_root; 399 400 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 401 402 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 403 cached_root = &vcpu->arch.mmu->prev_roots[i]; 404 405 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 406 eptp)) 407 roots |= KVM_MMU_ROOT_PREVIOUS(i); 408 } 409 if (roots) 410 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 411 } 412 413 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 414 struct x86_exception *fault, 415 bool from_hardware) 416 { 417 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 418 struct vcpu_vmx *vmx = to_vmx(vcpu); 419 unsigned long exit_qualification; 420 u32 vm_exit_reason; 421 422 if (vmx->nested.pml_full) { 423 vm_exit_reason = EXIT_REASON_PML_FULL; 424 vmx->nested.pml_full = false; 425 426 /* 427 * It should be impossible to trigger a nested PML Full VM-Exit 428 * for anything other than an EPT Violation from L2. KVM *can* 429 * trigger nEPT page fault injection in response to an EPT 430 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 431 * tables also changed, but KVM should not treat EPT Misconfig 432 * VM-Exits as writes. 433 */ 434 WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 435 436 /* 437 * PML Full and EPT Violation VM-Exits both use bit 12 to report 438 * "NMI unblocking due to IRET", i.e. the bit can be propagated 439 * as-is from the original EXIT_QUALIFICATION. 
440 */ 441 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 442 } else { 443 if (fault->error_code & PFERR_RSVD_MASK) { 444 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 445 exit_qualification = 0; 446 } else { 447 u64 mask = EPT_VIOLATION_GVA_IS_VALID | 448 EPT_VIOLATION_GVA_TRANSLATED; 449 450 if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT) 451 mask |= EPT_VIOLATION_GVA_USER | 452 EPT_VIOLATION_GVA_WRITABLE | 453 EPT_VIOLATION_GVA_NX; 454 455 exit_qualification = fault->exit_qualification & ~mask; 456 457 /* 458 * Use the EXIT_QUALIFICATION from the VMCS if and only 459 * if the hardware VM-Exit from L2 was an EPT Violation. 460 * If the fault is synthesized, then EXIT_QUALIFICATION 461 * is stale and/or holds entirely different data. And 462 * conversely, KVM _must_ rely on EXIT_QUALIFICATION if 463 * the fault came from hardware, because KVM only sees 464 * and walks the faulting GPA. 465 */ 466 if (from_hardware) 467 exit_qualification |= vmx_get_exit_qual(vcpu) & mask; 468 else 469 exit_qualification |= fault->exit_qualification & mask; 470 471 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 472 } 473 474 /* 475 * Although the caller (kvm_inject_emulated_page_fault) would 476 * have already synced the faulting address in the shadow EPT 477 * tables for the current EPTP12, we also need to sync it for 478 * any other cached EPTP02s based on the same EP4TA, since the 479 * TLB associates mappings to the EP4TA rather than the full EPTP. 
480 */ 481 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 482 fault->address); 483 } 484 485 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 486 vmcs12->guest_physical_address = fault->address; 487 } 488 489 static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu) 490 { 491 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 492 493 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC); 494 } 495 496 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 497 { 498 struct vcpu_vmx *vmx = to_vmx(vcpu); 499 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 500 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 501 502 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 503 nested_ept_ad_enabled(vcpu), 504 nested_ept_mbec_enabled(vcpu), 505 nested_ept_get_eptp(vcpu)); 506 } 507 508 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 509 { 510 WARN_ON(mmu_is_nested(vcpu)); 511 512 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 513 nested_ept_new_eptp(vcpu); 514 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 515 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 516 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 517 518 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 519 } 520 521 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 522 { 523 vcpu->arch.mmu = &vcpu->arch.root_mmu; 524 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 525 } 526 527 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 528 u16 error_code) 529 { 530 bool inequality, bit; 531 532 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 533 inequality = 534 (error_code & vmcs12->page_fault_error_code_mask) != 535 vmcs12->page_fault_error_code_match; 536 return inequality ^ bit; 537 } 538 539 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 540 u32 error_code) 541 { 542 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 543 544 /* 545 * Drop bits 31:16 
of the error code when performing the #PF mask+match 546 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 547 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 548 * error code. Including the to-be-dropped bits in the check might 549 * result in an "impossible" or missed exit from L1's perspective. 550 */ 551 if (vector == PF_VECTOR) 552 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 553 554 return (vmcs12->exception_bitmap & (1u << vector)); 555 } 556 557 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 558 struct vmcs12 *vmcs12) 559 { 560 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 561 return 0; 562 563 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 564 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 565 return -EINVAL; 566 567 return 0; 568 } 569 570 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 571 struct vmcs12 *vmcs12) 572 { 573 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 574 return 0; 575 576 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 577 return -EINVAL; 578 579 return 0; 580 } 581 582 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 583 struct vmcs12 *vmcs12) 584 { 585 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 586 return 0; 587 588 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 589 return -EINVAL; 590 591 if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) 592 return -EINVAL; 593 594 return 0; 595 } 596 597 /* 598 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 599 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 600 * only the "disable intercept" case needs to be handled. 
601 */ 602 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 603 unsigned long *msr_bitmap_l0, 604 u32 msr, int type) 605 { 606 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 607 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 608 609 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 610 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 611 } 612 613 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 614 { 615 int msr; 616 617 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 618 unsigned word = msr / BITS_PER_LONG; 619 620 msr_bitmap[word] = ~0; 621 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 622 } 623 } 624 625 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 626 static inline \ 627 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 628 unsigned long *msr_bitmap_l1, \ 629 unsigned long *msr_bitmap_l0, u32 msr) \ 630 { \ 631 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 632 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 633 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 634 else \ 635 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 636 } 637 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 638 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 639 640 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 641 unsigned long *msr_bitmap_l1, 642 unsigned long *msr_bitmap_l0, 643 u32 msr, int types) 644 { 645 if (types & MSR_TYPE_R) 646 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 647 msr_bitmap_l0, msr); 648 if (types & MSR_TYPE_W) 649 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 650 msr_bitmap_l0, msr); 651 } 652 653 #define nested_vmx_merge_msr_bitmaps(msr, type) \ 654 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ 655 msr_bitmap_l0, msr, type) 656 657 #define nested_vmx_merge_msr_bitmaps_read(msr) \ 658 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) 659 660 #define 
nested_vmx_merge_msr_bitmaps_write(msr) \ 661 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) 662 663 #define nested_vmx_merge_msr_bitmaps_rw(msr) \ 664 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) 665 666 static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, 667 unsigned long *msr_bitmap_l1, 668 unsigned long *msr_bitmap_l0) 669 { 670 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 671 struct vcpu_vmx *vmx = to_vmx(vcpu); 672 int i; 673 674 /* 675 * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if 676 * none of the MSRs can possibly be passed through to L1. 677 */ 678 if (!kvm_vcpu_has_mediated_pmu(vcpu)) 679 return; 680 681 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 682 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); 683 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); 684 } 685 686 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 687 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); 688 689 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); 690 nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); 691 nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); 692 } 693 694 /* 695 * Merge L0's and L1's MSR bitmap, return false to indicate that 696 * we do not use the hardware. 697 */ 698 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 699 struct vmcs12 *vmcs12) 700 { 701 struct vcpu_vmx *vmx = to_vmx(vcpu); 702 int msr; 703 unsigned long *msr_bitmap_l1; 704 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 705 struct kvm_host_map map; 706 707 /* Nothing to do if the MSR bitmap is not in use. */ 708 if (!cpu_has_vmx_msr_bitmap() || 709 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 710 return false; 711 712 /* 713 * MSR bitmap update can be skipped when: 714 * - MSR bitmap for L1 hasn't changed. 715 * - Nested hypervisor (L1) is attempting to launch the same L2 as 716 * before. 
717 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 718 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 719 */ 720 if (!vmx->nested.force_msr_bitmap_recalc) { 721 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 722 723 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 724 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 725 return true; 726 } 727 728 if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) 729 return false; 730 731 msr_bitmap_l1 = (unsigned long *)map.hva; 732 733 /* 734 * To keep the control flow simple, pay eight 8-byte writes (sixteen 735 * 4-byte writes on 32-bit systems) up front to enable intercepts for 736 * the x2APIC MSR range and selectively toggle those relevant to L2. 737 */ 738 enable_x2apic_msr_intercepts(msr_bitmap_l0); 739 740 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 741 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 742 /* 743 * L0 need not intercept reads for MSRs between 0x800 744 * and 0x8ff, it just lets the processor take the value 745 * from the virtual-APIC page; take those 256 bits 746 * directly from the L1 bitmap. 747 */ 748 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 749 unsigned word = msr / BITS_PER_LONG; 750 751 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 752 } 753 } 754 755 nested_vmx_disable_intercept_for_x2apic_msr( 756 msr_bitmap_l1, msr_bitmap_l0, 757 X2APIC_MSR(APIC_TASKPRI), 758 MSR_TYPE_R | MSR_TYPE_W); 759 760 if (nested_cpu_has_vid(vmcs12)) { 761 nested_vmx_disable_intercept_for_x2apic_msr( 762 msr_bitmap_l1, msr_bitmap_l0, 763 X2APIC_MSR(APIC_EOI), 764 MSR_TYPE_W); 765 nested_vmx_disable_intercept_for_x2apic_msr( 766 msr_bitmap_l1, msr_bitmap_l0, 767 X2APIC_MSR(APIC_SELF_IPI), 768 MSR_TYPE_W); 769 } 770 } 771 772 /* 773 * Always check vmcs01's bitmap to honor userspace MSR filters and any 774 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
775 */ 776 #ifdef CONFIG_X86_64 777 nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); 778 nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); 779 nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); 780 #endif 781 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); 782 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); 783 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); 784 785 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 786 MSR_IA32_APERF, MSR_TYPE_R); 787 788 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 789 MSR_IA32_MPERF, MSR_TYPE_R); 790 791 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 792 MSR_IA32_U_CET, MSR_TYPE_RW); 793 794 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 795 MSR_IA32_S_CET, MSR_TYPE_RW); 796 797 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 798 MSR_IA32_PL0_SSP, MSR_TYPE_RW); 799 800 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 801 MSR_IA32_PL1_SSP, MSR_TYPE_RW); 802 803 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 804 MSR_IA32_PL2_SSP, MSR_TYPE_RW); 805 806 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 807 MSR_IA32_PL3_SSP, MSR_TYPE_RW); 808 809 nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); 810 811 kvm_vcpu_unmap(vcpu, &map); 812 813 vmx->nested.force_msr_bitmap_recalc = false; 814 815 return true; 816 } 817 818 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 819 struct vmcs12 *vmcs12) 820 { 821 struct vcpu_vmx *vmx = to_vmx(vcpu); 822 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 823 824 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 825 vmcs12->vmcs_link_pointer == INVALID_GPA) 826 return; 827 828 if (ghc->gpa != vmcs12->vmcs_link_pointer && 829 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 830 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 831 return; 832 833 kvm_read_guest_cached(vcpu->kvm, ghc, 
get_shadow_vmcs12(vcpu), 834 VMCS12_SIZE); 835 } 836 837 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 838 struct vmcs12 *vmcs12) 839 { 840 struct vcpu_vmx *vmx = to_vmx(vcpu); 841 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 842 843 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 844 vmcs12->vmcs_link_pointer == INVALID_GPA) 845 return; 846 847 if (ghc->gpa != vmcs12->vmcs_link_pointer && 848 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 849 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 850 return; 851 852 kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), 853 VMCS12_SIZE); 854 } 855 856 /* 857 * In nested virtualization, check if L1 has set 858 * VM_EXIT_ACK_INTR_ON_EXIT 859 */ 860 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 861 { 862 return get_vmcs12(vcpu)->vm_exit_controls & 863 VM_EXIT_ACK_INTR_ON_EXIT; 864 } 865 866 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 867 struct vmcs12 *vmcs12) 868 { 869 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 870 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 871 return -EINVAL; 872 else 873 return 0; 874 } 875 876 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 877 struct vmcs12 *vmcs12) 878 { 879 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 880 !nested_cpu_has_apic_reg_virt(vmcs12) && 881 !nested_cpu_has_vid(vmcs12) && 882 !nested_cpu_has_posted_intr(vmcs12)) 883 return 0; 884 885 /* 886 * If virtualize x2apic mode is enabled, 887 * virtualize apic access must be disabled. 888 */ 889 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 890 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 891 return -EINVAL; 892 893 /* 894 * If virtual interrupt delivery is enabled, 895 * we must exit on external interrupts. 
896 */ 897 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 898 return -EINVAL; 899 900 /* 901 * bits 15:8 should be zero in posted_intr_nv, 902 * the descriptor address has been already checked 903 * in nested_get_vmcs12_pages. 904 * 905 * bits 5:0 of posted_intr_desc_addr should be zero. 906 */ 907 if (nested_cpu_has_posted_intr(vmcs12) && 908 (CC(!nested_cpu_has_vid(vmcs12)) || 909 CC(!nested_exit_intr_ack_set(vcpu)) || 910 CC((vmcs12->posted_intr_nv & 0xff00)) || 911 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 912 return -EINVAL; 913 914 /* tpr shadow is needed by all apicv features. */ 915 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 916 return -EINVAL; 917 918 return 0; 919 } 920 921 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 922 { 923 struct vcpu_vmx *vmx = to_vmx(vcpu); 924 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 925 vmx->nested.msrs.misc_high); 926 927 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 928 } 929 930 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 931 u32 count, u64 addr) 932 { 933 if (count == 0) 934 return 0; 935 936 /* 937 * Exceeding the limit results in architecturally _undefined_ behavior, 938 * i.e. KVM is allowed to do literally anything in response to a bad 939 * limit. Immediately generate a consistency check so that code that 940 * consumes the count doesn't need to worry about extreme edge cases. 
941 */ 942 if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) 943 return -EINVAL; 944 945 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 946 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 947 return -EINVAL; 948 949 return 0; 950 } 951 952 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 953 struct vmcs12 *vmcs12) 954 { 955 if (CC(nested_vmx_check_msr_switch(vcpu, 956 vmcs12->vm_exit_msr_load_count, 957 vmcs12->vm_exit_msr_load_addr)) || 958 CC(nested_vmx_check_msr_switch(vcpu, 959 vmcs12->vm_exit_msr_store_count, 960 vmcs12->vm_exit_msr_store_addr))) 961 return -EINVAL; 962 963 return 0; 964 } 965 966 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 967 struct vmcs12 *vmcs12) 968 { 969 if (CC(nested_vmx_check_msr_switch(vcpu, 970 vmcs12->vm_entry_msr_load_count, 971 vmcs12->vm_entry_msr_load_addr))) 972 return -EINVAL; 973 974 return 0; 975 } 976 977 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 978 struct vmcs12 *vmcs12) 979 { 980 if (!nested_cpu_has_pml(vmcs12)) 981 return 0; 982 983 if (CC(!nested_cpu_has_ept(vmcs12)) || 984 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 985 return -EINVAL; 986 987 return 0; 988 } 989 990 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 991 struct vmcs12 *vmcs12) 992 { 993 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 994 !nested_cpu_has_ept(vmcs12))) 995 return -EINVAL; 996 return 0; 997 } 998 999 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 1000 struct vmcs12 *vmcs12) 1001 { 1002 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 1003 !nested_cpu_has_ept(vmcs12))) 1004 return -EINVAL; 1005 return 0; 1006 } 1007 1008 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 1009 struct vmcs12 *vmcs12) 1010 { 1011 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 1012 return 0; 1013 1014 if 
(CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 1015 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 1016 return -EINVAL; 1017 1018 return 0; 1019 } 1020 1021 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 1022 struct vmx_msr_entry *e) 1023 { 1024 /* x2APIC MSR accesses are not allowed */ 1025 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 1026 return -EINVAL; 1027 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 1028 CC(e->index == MSR_IA32_UCODE_REV)) 1029 return -EINVAL; 1030 if (CC(e->reserved != 0)) 1031 return -EINVAL; 1032 return 0; 1033 } 1034 1035 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 1036 struct vmx_msr_entry *e) 1037 { 1038 if (CC(e->index == MSR_FS_BASE) || 1039 CC(e->index == MSR_GS_BASE) || 1040 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 1041 nested_vmx_msr_check_common(vcpu, e)) 1042 return -EINVAL; 1043 return 0; 1044 } 1045 1046 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 1047 struct vmx_msr_entry *e) 1048 { 1049 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 1050 nested_vmx_msr_check_common(vcpu, e)) 1051 return -EINVAL; 1052 return 0; 1053 } 1054 1055 /* 1056 * Load guest's/host's msr at nested entry/exit. 1057 * return 0 for success, entry index for failure. 1058 * 1059 * One of the failure modes for MSR load/store is when a list exceeds the 1060 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 1061 * as possible, process all valid entries before failing rather than precheck 1062 * for a capacity violation. 
1063 */ 1064 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1065 { 1066 u32 i; 1067 struct vmx_msr_entry e; 1068 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1069 1070 for (i = 0; i < count; i++) { 1071 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1072 goto fail; 1073 1074 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 1075 &e, sizeof(e))) { 1076 pr_debug_ratelimited( 1077 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1078 __func__, i, gpa + i * sizeof(e)); 1079 goto fail; 1080 } 1081 if (nested_vmx_load_msr_check(vcpu, &e)) { 1082 pr_debug_ratelimited( 1083 "%s check failed (%u, 0x%x, 0x%x)\n", 1084 __func__, i, e.index, e.reserved); 1085 goto fail; 1086 } 1087 if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { 1088 pr_debug_ratelimited( 1089 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1090 __func__, i, e.index, e.value); 1091 goto fail; 1092 } 1093 } 1094 return 0; 1095 fail: 1096 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 1097 return i + 1; 1098 } 1099 1100 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 1101 u32 msr_index, 1102 u64 *data) 1103 { 1104 struct vcpu_vmx *vmx = to_vmx(vcpu); 1105 1106 /* 1107 * If the L0 hypervisor stored a more accurate value for the TSC that 1108 * does not include the time taken for emulation of the L2->L1 1109 * VM-exit in L0, use the more accurate value. 
1110 */ 1111 if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { 1112 int slot = vmx->nested.tsc_autostore_slot; 1113 u64 host_tsc = vmx->msr_autostore.val[slot].value; 1114 1115 *data = kvm_read_l1_tsc(vcpu, host_tsc); 1116 return true; 1117 } 1118 1119 if (kvm_emulate_msr_read(vcpu, msr_index, data)) { 1120 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1121 msr_index); 1122 return false; 1123 } 1124 return true; 1125 } 1126 1127 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1128 struct vmx_msr_entry *e) 1129 { 1130 if (kvm_vcpu_read_guest(vcpu, 1131 gpa + i * sizeof(*e), 1132 e, 2 * sizeof(u32))) { 1133 pr_debug_ratelimited( 1134 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1135 __func__, i, gpa + i * sizeof(*e)); 1136 return false; 1137 } 1138 if (nested_vmx_store_msr_check(vcpu, e)) { 1139 pr_debug_ratelimited( 1140 "%s check failed (%u, 0x%x, 0x%x)\n", 1141 __func__, i, e->index, e->reserved); 1142 return false; 1143 } 1144 return true; 1145 } 1146 1147 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1148 { 1149 u64 data; 1150 u32 i; 1151 struct vmx_msr_entry e; 1152 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1153 1154 for (i = 0; i < count; i++) { 1155 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1156 return -EINVAL; 1157 1158 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1159 return -EINVAL; 1160 1161 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1162 return -EINVAL; 1163 1164 if (kvm_vcpu_write_guest(vcpu, 1165 gpa + i * sizeof(e) + 1166 offsetof(struct vmx_msr_entry, value), 1167 &data, sizeof(data))) { 1168 pr_debug_ratelimited( 1169 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1170 __func__, i, e.index, data); 1171 return -EINVAL; 1172 } 1173 } 1174 return 0; 1175 } 1176 1177 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1178 { 1179 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1180 u32 count = 
vmcs12->vm_exit_msr_store_count; 1181 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1182 struct vmx_msr_entry e; 1183 u32 i; 1184 1185 for (i = 0; i < count; i++) { 1186 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1187 return false; 1188 1189 if (e.index == msr_index) 1190 return true; 1191 } 1192 return false; 1193 } 1194 1195 /* 1196 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1197 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1198 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1199 * @entry_failure_code. 1200 */ 1201 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1202 bool nested_ept, bool reload_pdptrs, 1203 enum vm_entry_failure_code *entry_failure_code) 1204 { 1205 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1206 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1207 return -EINVAL; 1208 } 1209 1210 /* 1211 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1212 * must not be dereferenced. 1213 */ 1214 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1215 CC(!load_pdptrs(vcpu, cr3))) { 1216 *entry_failure_code = ENTRY_FAIL_PDPTE; 1217 return -EINVAL; 1218 } 1219 1220 vcpu->arch.cr3 = cr3; 1221 kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); 1222 1223 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1224 kvm_init_mmu(vcpu); 1225 1226 if (!nested_ept) 1227 kvm_mmu_new_pgd(vcpu, cr3); 1228 1229 return 0; 1230 } 1231 1232 /* 1233 * Returns if KVM is able to config CPU to tag TLB entries 1234 * populated by L2 differently than TLB entries populated 1235 * by L1. 1236 * 1237 * If L0 uses EPT, L1 and L2 run with different EPTP because 1238 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1239 * are tagged with different EPTP. 
1240 * 1241 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1242 * with different VPID (L1 entries are tagged with vmx->vpid 1243 * while L2 entries are tagged with vmx->nested.vpid02). 1244 */ 1245 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1246 { 1247 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1248 1249 return enable_ept || 1250 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1251 } 1252 1253 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1254 struct vmcs12 *vmcs12, 1255 bool is_vmenter) 1256 { 1257 struct vcpu_vmx *vmx = to_vmx(vcpu); 1258 1259 /* Handle pending Hyper-V TLB flush requests */ 1260 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1261 1262 /* 1263 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 1264 * same VPID as the host, and so architecturally, linear and combined 1265 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM 1266 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, 1267 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This 1268 * is required if VPID is disabled in KVM, as a TLB flush (there are no 1269 * VPIDs) still occurs from L1's perspective, and KVM may need to 1270 * synchronize the MMU in response to the guest TLB flush. 1271 * 1272 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1273 * EPT is a special snowflake, as guest-physical mappings aren't 1274 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1275 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1276 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1277 * those mappings. 1278 */ 1279 if (!nested_cpu_has_vpid(vmcs12)) { 1280 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1281 return; 1282 } 1283 1284 /* L2 should never have a VPID if VPID is disabled. */ 1285 WARN_ON(!enable_vpid); 1286 1287 /* 1288 * VPID is enabled and in use by vmcs12. 
If vpid12 is changing, then 1289 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1290 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1291 * that the new vpid12 has never been used and thus represents a new 1292 * guest ASID that cannot have entries in the TLB. 1293 */ 1294 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1295 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1296 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1297 return; 1298 } 1299 1300 /* 1301 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1302 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1303 * KVM was unable to allocate a VPID for L2, flush the current context 1304 * as the effective ASID is common to both L1 and L2. 1305 */ 1306 if (!nested_has_guest_tlb_tag(vcpu)) 1307 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1308 } 1309 1310 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1311 { 1312 superset &= mask; 1313 subset &= mask; 1314 1315 return (superset | subset) == superset; 1316 } 1317 1318 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1319 { 1320 const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1321 VMX_BASIC_INOUT | 1322 VMX_BASIC_TRUE_CTLS | 1323 VMX_BASIC_NO_HW_ERROR_CODE_CC; 1324 1325 const u64 reserved_bits = GENMASK_ULL(63, 57) | 1326 GENMASK_ULL(47, 45) | 1327 BIT_ULL(31); 1328 1329 u64 vmx_basic = vmcs_config.nested.basic; 1330 1331 BUILD_BUG_ON(feature_bits & reserved_bits); 1332 1333 /* 1334 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has 1335 * inverted polarity), the incoming value must not set feature bits or 1336 * reserved bits that aren't allowed/supported by KVM. Fields, i.e. 1337 * multi-bit values, are explicitly checked below. 
1338 */ 1339 if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) 1340 return -EINVAL; 1341 1342 /* 1343 * KVM does not emulate a version of VMX that constrains physical 1344 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1345 */ 1346 if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1347 return -EINVAL; 1348 1349 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1350 vmx_basic_vmcs_revision_id(data)) 1351 return -EINVAL; 1352 1353 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1354 return -EINVAL; 1355 1356 vmx->nested.msrs.basic = data; 1357 return 0; 1358 } 1359 1360 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1361 u32 **low, u32 **high) 1362 { 1363 switch (msr_index) { 1364 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1365 *low = &msrs->pinbased_ctls_low; 1366 *high = &msrs->pinbased_ctls_high; 1367 break; 1368 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1369 *low = &msrs->procbased_ctls_low; 1370 *high = &msrs->procbased_ctls_high; 1371 break; 1372 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1373 *low = &msrs->exit_ctls_low; 1374 *high = &msrs->exit_ctls_high; 1375 break; 1376 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1377 *low = &msrs->entry_ctls_low; 1378 *high = &msrs->entry_ctls_high; 1379 break; 1380 case MSR_IA32_VMX_PROCBASED_CTLS2: 1381 *low = &msrs->secondary_ctls_low; 1382 *high = &msrs->secondary_ctls_high; 1383 break; 1384 default: 1385 BUG(); 1386 } 1387 } 1388 1389 static int 1390 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1391 { 1392 u32 *lowp, *highp; 1393 u64 supported; 1394 1395 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1396 1397 supported = vmx_control_msr(*lowp, *highp); 1398 1399 /* Check must-be-1 bits are still 1. */ 1400 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1401 return -EINVAL; 1402 1403 /* Check must-be-0 bits are still 0. 
*/ 1404 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1405 return -EINVAL; 1406 1407 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1408 *lowp = data; 1409 *highp = data >> 32; 1410 return 0; 1411 } 1412 1413 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1414 { 1415 const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1416 VMX_MISC_ACTIVITY_HLT | 1417 VMX_MISC_ACTIVITY_SHUTDOWN | 1418 VMX_MISC_ACTIVITY_WAIT_SIPI | 1419 VMX_MISC_INTEL_PT | 1420 VMX_MISC_RDMSR_IN_SMM | 1421 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1422 VMX_MISC_VMXOFF_BLOCK_SMI | 1423 VMX_MISC_ZERO_LEN_INS; 1424 1425 const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1426 1427 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1428 vmcs_config.nested.misc_high); 1429 1430 BUILD_BUG_ON(feature_bits & reserved_bits); 1431 1432 /* 1433 * The incoming value must not set feature bits or reserved bits that 1434 * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1435 * explicitly checked below. 
1436 */ 1437 if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1438 return -EINVAL; 1439 1440 if ((vmx->nested.msrs.pinbased_ctls_high & 1441 PIN_BASED_VMX_PREEMPTION_TIMER) && 1442 vmx_misc_preemption_timer_rate(data) != 1443 vmx_misc_preemption_timer_rate(vmx_misc)) 1444 return -EINVAL; 1445 1446 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1447 return -EINVAL; 1448 1449 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1450 return -EINVAL; 1451 1452 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1453 return -EINVAL; 1454 1455 vmx->nested.msrs.misc_low = data; 1456 vmx->nested.msrs.misc_high = data >> 32; 1457 1458 return 0; 1459 } 1460 1461 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1462 { 1463 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1464 vmcs_config.nested.vpid_caps); 1465 1466 /* Every bit is either reserved or a feature bit. */ 1467 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1468 return -EINVAL; 1469 1470 vmx->nested.msrs.ept_caps = data; 1471 vmx->nested.msrs.vpid_caps = data >> 32; 1472 return 0; 1473 } 1474 1475 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1476 { 1477 switch (msr_index) { 1478 case MSR_IA32_VMX_CR0_FIXED0: 1479 return &msrs->cr0_fixed0; 1480 case MSR_IA32_VMX_CR4_FIXED0: 1481 return &msrs->cr4_fixed0; 1482 default: 1483 BUG(); 1484 } 1485 } 1486 1487 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1488 { 1489 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1490 1491 /* 1492 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1493 * must be 1 in the restored value. 1494 */ 1495 if (!is_bitwise_subset(data, *msr, -1ULL)) 1496 return -EINVAL; 1497 1498 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1499 return 0; 1500 } 1501 1502 /* 1503 * Called when userspace is restoring VMX MSRs. 
1504 * 1505 * Returns 0 on success, non-0 otherwise. 1506 */ 1507 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1508 { 1509 struct vcpu_vmx *vmx = to_vmx(vcpu); 1510 1511 /* 1512 * Don't allow changes to the VMX capability MSRs while the vCPU 1513 * is in VMX operation. 1514 */ 1515 if (vmx->nested.vmxon) 1516 return -EBUSY; 1517 1518 switch (msr_index) { 1519 case MSR_IA32_VMX_BASIC: 1520 return vmx_restore_vmx_basic(vmx, data); 1521 case MSR_IA32_VMX_PINBASED_CTLS: 1522 case MSR_IA32_VMX_PROCBASED_CTLS: 1523 case MSR_IA32_VMX_EXIT_CTLS: 1524 case MSR_IA32_VMX_ENTRY_CTLS: 1525 /* 1526 * The "non-true" VMX capability MSRs are generated from the 1527 * "true" MSRs, so we do not support restoring them directly. 1528 * 1529 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1530 * should restore the "true" MSRs with the must-be-1 bits 1531 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1532 * DEFAULT SETTINGS". 1533 */ 1534 return -EINVAL; 1535 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1536 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1537 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1538 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1539 case MSR_IA32_VMX_PROCBASED_CTLS2: 1540 return vmx_restore_control_msr(vmx, msr_index, data); 1541 case MSR_IA32_VMX_MISC: 1542 return vmx_restore_vmx_misc(vmx, data); 1543 case MSR_IA32_VMX_CR0_FIXED0: 1544 case MSR_IA32_VMX_CR4_FIXED0: 1545 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1546 case MSR_IA32_VMX_CR0_FIXED1: 1547 case MSR_IA32_VMX_CR4_FIXED1: 1548 /* 1549 * These MSRs are generated based on the vCPU's CPUID, so we 1550 * do not support restoring them directly. 
1551 */ 1552 return -EINVAL; 1553 case MSR_IA32_VMX_EPT_VPID_CAP: 1554 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1555 case MSR_IA32_VMX_VMCS_ENUM: 1556 vmx->nested.msrs.vmcs_enum = data; 1557 return 0; 1558 case MSR_IA32_VMX_VMFUNC: 1559 if (data & ~vmcs_config.nested.vmfunc_controls) 1560 return -EINVAL; 1561 vmx->nested.msrs.vmfunc_controls = data; 1562 return 0; 1563 default: 1564 /* 1565 * The rest of the VMX capability MSRs do not support restore. 1566 */ 1567 return -EINVAL; 1568 } 1569 } 1570 1571 /* Returns 0 on success, non-0 otherwise. */ 1572 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1573 { 1574 switch (msr_index) { 1575 case MSR_IA32_VMX_BASIC: 1576 *pdata = msrs->basic; 1577 break; 1578 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1579 case MSR_IA32_VMX_PINBASED_CTLS: 1580 *pdata = vmx_control_msr( 1581 msrs->pinbased_ctls_low, 1582 msrs->pinbased_ctls_high); 1583 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1584 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1585 break; 1586 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1587 case MSR_IA32_VMX_PROCBASED_CTLS: 1588 *pdata = vmx_control_msr( 1589 msrs->procbased_ctls_low, 1590 msrs->procbased_ctls_high); 1591 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1592 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1593 break; 1594 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1595 case MSR_IA32_VMX_EXIT_CTLS: 1596 *pdata = vmx_control_msr( 1597 msrs->exit_ctls_low, 1598 msrs->exit_ctls_high); 1599 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1600 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1601 break; 1602 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1603 case MSR_IA32_VMX_ENTRY_CTLS: 1604 *pdata = vmx_control_msr( 1605 msrs->entry_ctls_low, 1606 msrs->entry_ctls_high); 1607 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1608 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1609 break; 1610 case MSR_IA32_VMX_MISC: 1611 *pdata = vmx_control_msr( 1612 msrs->misc_low, 1613 msrs->misc_high); 1614 break; 1615 case 
MSR_IA32_VMX_CR0_FIXED0: 1616 *pdata = msrs->cr0_fixed0; 1617 break; 1618 case MSR_IA32_VMX_CR0_FIXED1: 1619 *pdata = msrs->cr0_fixed1; 1620 break; 1621 case MSR_IA32_VMX_CR4_FIXED0: 1622 *pdata = msrs->cr4_fixed0; 1623 break; 1624 case MSR_IA32_VMX_CR4_FIXED1: 1625 *pdata = msrs->cr4_fixed1; 1626 break; 1627 case MSR_IA32_VMX_VMCS_ENUM: 1628 *pdata = msrs->vmcs_enum; 1629 break; 1630 case MSR_IA32_VMX_PROCBASED_CTLS2: 1631 *pdata = vmx_control_msr( 1632 msrs->secondary_ctls_low, 1633 msrs->secondary_ctls_high); 1634 break; 1635 case MSR_IA32_VMX_EPT_VPID_CAP: 1636 *pdata = msrs->ept_caps | 1637 ((u64)msrs->vpid_caps << 32); 1638 break; 1639 case MSR_IA32_VMX_VMFUNC: 1640 *pdata = msrs->vmfunc_controls; 1641 break; 1642 default: 1643 return 1; 1644 } 1645 1646 return 0; 1647 } 1648 1649 /* 1650 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1651 * been modified by the L1 guest. Note, "writable" in this context means 1652 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1653 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1654 * VM-exit information fields (which are actually writable if the vCPU is 1655 * configured to support "VMWRITE to any supported field in the VMCS"). 
1656 */ 1657 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1658 { 1659 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1660 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1661 struct shadow_vmcs_field field; 1662 unsigned long val; 1663 int i; 1664 1665 if (WARN_ON(!shadow_vmcs)) 1666 return; 1667 1668 preempt_disable(); 1669 1670 vmcs_load(shadow_vmcs); 1671 1672 for (i = 0; i < max_shadow_read_write_fields; i++) { 1673 field = shadow_read_write_fields[i]; 1674 val = __vmcs_readl(field.encoding); 1675 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1676 } 1677 1678 vmcs_clear(shadow_vmcs); 1679 vmcs_load(vmx->loaded_vmcs->vmcs); 1680 1681 preempt_enable(); 1682 } 1683 1684 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1685 { 1686 const struct shadow_vmcs_field *fields[] = { 1687 shadow_read_write_fields, 1688 shadow_read_only_fields 1689 }; 1690 const int max_fields[] = { 1691 max_shadow_read_write_fields, 1692 max_shadow_read_only_fields 1693 }; 1694 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1695 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1696 struct shadow_vmcs_field field; 1697 unsigned long val; 1698 int i, q; 1699 1700 if (WARN_ON(!shadow_vmcs)) 1701 return; 1702 1703 vmcs_load(shadow_vmcs); 1704 1705 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1706 for (i = 0; i < max_fields[q]; i++) { 1707 field = fields[q][i]; 1708 val = vmcs12_read_any(vmcs12, field.encoding, 1709 field.offset); 1710 __vmcs_writel(field.encoding, val); 1711 } 1712 } 1713 1714 vmcs_clear(shadow_vmcs); 1715 vmcs_load(vmx->loaded_vmcs->vmcs); 1716 } 1717 1718 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1719 { 1720 #ifdef CONFIG_KVM_HYPERV 1721 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1722 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1723 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1724 1725 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1726 vmcs12->tpr_threshold = 
evmcs->tpr_threshold; 1727 vmcs12->guest_rip = evmcs->guest_rip; 1728 1729 if (unlikely(!(hv_clean_fields & 1730 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1731 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1732 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1733 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1734 } 1735 1736 if (unlikely(!(hv_clean_fields & 1737 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1738 vmcs12->guest_rsp = evmcs->guest_rsp; 1739 vmcs12->guest_rflags = evmcs->guest_rflags; 1740 vmcs12->guest_interruptibility_info = 1741 evmcs->guest_interruptibility_info; 1742 /* 1743 * Not present in struct vmcs12: 1744 * vmcs12->guest_ssp = evmcs->guest_ssp; 1745 */ 1746 } 1747 1748 if (unlikely(!(hv_clean_fields & 1749 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1750 vmcs12->cpu_based_vm_exec_control = 1751 evmcs->cpu_based_vm_exec_control; 1752 } 1753 1754 if (unlikely(!(hv_clean_fields & 1755 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1756 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1757 } 1758 1759 if (unlikely(!(hv_clean_fields & 1760 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1761 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1762 } 1763 1764 if (unlikely(!(hv_clean_fields & 1765 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1766 vmcs12->vm_entry_intr_info_field = 1767 evmcs->vm_entry_intr_info_field; 1768 vmcs12->vm_entry_exception_error_code = 1769 evmcs->vm_entry_exception_error_code; 1770 vmcs12->vm_entry_instruction_len = 1771 evmcs->vm_entry_instruction_len; 1772 } 1773 1774 if (unlikely(!(hv_clean_fields & 1775 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1776 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1777 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1778 vmcs12->host_cr0 = evmcs->host_cr0; 1779 vmcs12->host_cr3 = evmcs->host_cr3; 1780 vmcs12->host_cr4 = evmcs->host_cr4; 1781 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1782 vmcs12->host_ia32_sysenter_eip = 
evmcs->host_ia32_sysenter_eip; 1783 vmcs12->host_rip = evmcs->host_rip; 1784 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1785 vmcs12->host_es_selector = evmcs->host_es_selector; 1786 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1787 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1788 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1789 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1790 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1791 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1792 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1793 /* 1794 * Not present in struct vmcs12: 1795 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1796 * vmcs12->host_ssp = evmcs->host_ssp; 1797 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1798 */ 1799 } 1800 1801 if (unlikely(!(hv_clean_fields & 1802 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1803 vmcs12->pin_based_vm_exec_control = 1804 evmcs->pin_based_vm_exec_control; 1805 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1806 vmcs12->secondary_vm_exec_control = 1807 evmcs->secondary_vm_exec_control; 1808 } 1809 1810 if (unlikely(!(hv_clean_fields & 1811 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1812 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1813 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1814 } 1815 1816 if (unlikely(!(hv_clean_fields & 1817 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1818 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1819 } 1820 1821 if (unlikely(!(hv_clean_fields & 1822 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1823 vmcs12->guest_es_base = evmcs->guest_es_base; 1824 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1825 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1826 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1827 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1828 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1829 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1830 
vmcs12->guest_tr_base = evmcs->guest_tr_base; 1831 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1832 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1833 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1834 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1835 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1836 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1837 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1838 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1839 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1840 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1841 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1842 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1843 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1844 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1845 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1846 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1847 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1848 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1849 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1850 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1851 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1852 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1853 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1854 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1855 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1856 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1857 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1858 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1859 } 1860 1861 if (unlikely(!(hv_clean_fields & 1862 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1863 vmcs12->tsc_offset = evmcs->tsc_offset; 1864 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1865 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1866 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1867 
vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1868 } 1869 1870 if (unlikely(!(hv_clean_fields & 1871 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1872 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1873 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1874 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1875 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1876 vmcs12->guest_cr0 = evmcs->guest_cr0; 1877 vmcs12->guest_cr3 = evmcs->guest_cr3; 1878 vmcs12->guest_cr4 = evmcs->guest_cr4; 1879 vmcs12->guest_dr7 = evmcs->guest_dr7; 1880 } 1881 1882 if (unlikely(!(hv_clean_fields & 1883 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1884 vmcs12->host_fs_base = evmcs->host_fs_base; 1885 vmcs12->host_gs_base = evmcs->host_gs_base; 1886 vmcs12->host_tr_base = evmcs->host_tr_base; 1887 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1888 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1889 vmcs12->host_rsp = evmcs->host_rsp; 1890 } 1891 1892 if (unlikely(!(hv_clean_fields & 1893 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1894 vmcs12->ept_pointer = evmcs->ept_pointer; 1895 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1896 } 1897 1898 if (unlikely(!(hv_clean_fields & 1899 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1900 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1901 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1902 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1903 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1904 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1905 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1906 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1907 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1908 vmcs12->guest_pending_dbg_exceptions = 1909 evmcs->guest_pending_dbg_exceptions; 1910 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1911 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1912 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1913 vmcs12->guest_activity_state = 
evmcs->guest_activity_state; 1914 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1915 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1916 /* 1917 * Not present in struct vmcs12: 1918 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1919 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1920 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1921 */ 1922 } 1923 1924 /* 1925 * Not used? 1926 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1927 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1928 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1929 * vmcs12->page_fault_error_code_mask = 1930 * evmcs->page_fault_error_code_mask; 1931 * vmcs12->page_fault_error_code_match = 1932 * evmcs->page_fault_error_code_match; 1933 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1934 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1935 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1936 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1937 */ 1938 1939 /* 1940 * Read only fields: 1941 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1942 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1943 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1944 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1945 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1946 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1947 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1948 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1949 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1950 * vmcs12->exit_qualification = evmcs->exit_qualification; 1951 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1952 * 1953 * Not present in struct vmcs12: 1954 * 
vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * Copy fields from the cached vmcs12 (vmx->nested.cached_vmcs12) into the
 * enlightened VMCS returned by nested_vmx_evmcs().
 *
 * Only the fields assigned below are written.  The fields listed in the
 * comment at the top of the function body are deliberately skipped, for the
 * reasons given there.
 *
 * When CONFIG_KVM_HYPERV is not set, reaching this function is reported via
 * KVM_BUG_ON() and nothing is copied.
 */
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 *
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 *
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
	 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
	 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
	 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
	 *
	 * Not present in struct vmcs12:
	 *
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
	 * evmcs->host_ssp = vmcs12->host_ssp;
	 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
	 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
	 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
	 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
	 * evmcs->guest_ssp = vmcs12->guest_ssp;
	 */

	/* Guest segment selectors. */
	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	/* Guest segment and descriptor-table limits. */
	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	/* Guest segment access rights. */
	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	/* Guest segment and descriptor-table bases. */
	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	/* VM-instruction error and VM-exit information fields. */
	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
2147 */ 2148 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 2149 struct kvm_vcpu *vcpu, bool from_launch) 2150 { 2151 #ifdef CONFIG_KVM_HYPERV 2152 struct vcpu_vmx *vmx = to_vmx(vcpu); 2153 bool evmcs_gpa_changed = false; 2154 u64 evmcs_gpa; 2155 2156 if (likely(!guest_cpu_cap_has_evmcs(vcpu))) 2157 return EVMPTRLD_DISABLED; 2158 2159 evmcs_gpa = nested_get_evmptr(vcpu); 2160 if (!evmptr_is_valid(evmcs_gpa)) { 2161 nested_release_evmcs(vcpu); 2162 return EVMPTRLD_DISABLED; 2163 } 2164 2165 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2166 vmx->nested.current_vmptr = INVALID_GPA; 2167 2168 nested_release_evmcs(vcpu); 2169 2170 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2171 &vmx->nested.hv_evmcs_map)) 2172 return EVMPTRLD_ERROR; 2173 2174 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2175 2176 /* 2177 * Currently, KVM only supports eVMCS version 1 2178 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2179 * value to first u32 field of eVMCS which should specify eVMCS 2180 * VersionNumber. 2181 * 2182 * Guest should be aware of supported eVMCS versions by host by 2183 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2184 * expected to set this CPUID leaf according to the value 2185 * returned in vmcs_version from nested_enable_evmcs(). 2186 * 2187 * However, it turns out that Microsoft Hyper-V fails to comply 2188 * to their own invented interface: When Hyper-V use eVMCS, it 2189 * just sets first u32 field of eVMCS to revision_id specified 2190 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2191 * which is one of the supported versions specified in 2192 * CPUID.0x4000000A.EAX[0:15]. 2193 * 2194 * To overcome Hyper-V bug, we accept here either a supported 2195 * eVMCS version or VMCS12 revision_id as valid values for first 2196 * u32 field of eVMCS. 
2197 */ 2198 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2199 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2200 nested_release_evmcs(vcpu); 2201 return EVMPTRLD_VMFAIL; 2202 } 2203 2204 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2205 2206 evmcs_gpa_changed = true; 2207 /* 2208 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2209 * reloaded from guest's memory (read only fields, fields not 2210 * present in struct hv_enlightened_vmcs, ...). Make sure there 2211 * are no leftovers. 2212 */ 2213 if (from_launch) { 2214 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2215 memset(vmcs12, 0, sizeof(*vmcs12)); 2216 vmcs12->hdr.revision_id = VMCS12_REVISION; 2217 } 2218 2219 } 2220 2221 /* 2222 * Clean fields data can't be used on VMLAUNCH and when we switch 2223 * between different L2 guests as KVM keeps a single VMCS12 per L1. 2224 */ 2225 if (from_launch || evmcs_gpa_changed) { 2226 vmx->nested.hv_evmcs->hv_clean_fields &= 2227 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2228 2229 vmx->nested.force_msr_bitmap_recalc = true; 2230 } 2231 2232 return EVMPTRLD_SUCCEEDED; 2233 #else 2234 return EVMPTRLD_DISABLED; 2235 #endif 2236 } 2237 2238 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2239 { 2240 struct vcpu_vmx *vmx = to_vmx(vcpu); 2241 2242 if (nested_vmx_is_evmptr12_valid(vmx)) 2243 copy_vmcs12_to_enlightened(vmx); 2244 else 2245 copy_vmcs12_to_shadow(vmx); 2246 2247 vmx->nested.need_vmcs12_to_shadow_sync = false; 2248 } 2249 2250 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2251 { 2252 struct vcpu_vmx *vmx = 2253 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2254 2255 vmx->nested.preemption_timer_expired = true; 2256 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2257 kvm_vcpu_kick(&vmx->vcpu); 2258 2259 return HRTIMER_NORESTART; 2260 } 2261 2262 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2263 { 2264 struct vcpu_vmx *vmx = to_vmx(vcpu); 2265 struct vmcs12 *vmcs12 = 
get_vmcs12(vcpu); 2266 2267 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2268 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2269 2270 if (!vmx->nested.has_preemption_timer_deadline) { 2271 vmx->nested.preemption_timer_deadline = 2272 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2273 vmx->nested.has_preemption_timer_deadline = true; 2274 } 2275 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2276 } 2277 2278 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2279 u64 preemption_timeout) 2280 { 2281 struct vcpu_vmx *vmx = to_vmx(vcpu); 2282 2283 /* 2284 * A timer value of zero is architecturally guaranteed to cause 2285 * a VMExit prior to executing any instructions in the guest. 2286 */ 2287 if (preemption_timeout == 0) { 2288 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2289 return; 2290 } 2291 2292 if (vcpu->arch.virtual_tsc_khz == 0) 2293 return; 2294 2295 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2296 preemption_timeout *= 1000000; 2297 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2298 hrtimer_start(&vmx->nested.preemption_timer, 2299 ktime_add_ns(ktime_get(), preemption_timeout), 2300 HRTIMER_MODE_ABS_PINNED); 2301 } 2302 2303 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2304 { 2305 if (vmx->vcpu.arch.nested_run_pending && 2306 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2307 return vmcs12->guest_ia32_efer; 2308 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2309 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2310 else 2311 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2312 } 2313 2314 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2315 { 2316 struct kvm *kvm = vmx->vcpu.kvm; 2317 2318 /* 2319 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2320 * according to L0's settings (vmcs12 is irrelevant here). Host 2321 * fields that come from L0 and are not constant, e.g. 
HOST_CR3, 2322 * will be set as needed prior to VMLAUNCH/VMRESUME. 2323 */ 2324 if (vmx->nested.vmcs02_initialized) 2325 return; 2326 vmx->nested.vmcs02_initialized = true; 2327 2328 if (vmx->ve_info) 2329 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2330 2331 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2332 if (cpu_has_vmx_vmfunc()) 2333 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2334 2335 if (cpu_has_vmx_posted_intr()) 2336 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2337 2338 if (cpu_has_vmx_msr_bitmap()) 2339 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2340 2341 /* 2342 * PML is emulated for L2, but never enabled in hardware as the MMU 2343 * handles A/D emulation. Disabling PML for L2 also avoids having to 2344 * deal with filtering out L2 GPAs from the buffer. 2345 */ 2346 if (enable_pml) { 2347 vmcs_write64(PML_ADDRESS, 0); 2348 vmcs_write16(GUEST_PML_INDEX, -1); 2349 } 2350 2351 if (cpu_has_vmx_encls_vmexit()) 2352 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2353 2354 if (kvm_notify_vmexit_enabled(kvm)) 2355 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2356 2357 /* 2358 * Set the MSR load/store lists to match L0's settings. Only the 2359 * addresses are constant (for vmcs02), the counts can change based 2360 * on L2's behavior, e.g. switching to/from long mode. 2361 */ 2362 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 2363 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2364 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2365 2366 vmx_set_constant_host_state(vmx); 2367 } 2368 2369 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2370 struct vmcs12 *vmcs12) 2371 { 2372 prepare_vmcs02_constant_state(vmx); 2373 2374 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2375 2376 /* 2377 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2378 * same VPID as the host. 
Emulate this behavior by using vpid01 for L2 2379 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2380 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2381 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2382 * required flushes), but doing so would cause KVM to over-flush. E.g. 2383 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2384 * and then runs L2 X again, then KVM can and should retain TLB entries 2385 * for VPID12=1. 2386 */ 2387 if (enable_vpid) { 2388 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2389 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2390 else 2391 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2392 } 2393 } 2394 2395 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2396 struct vmcs12 *vmcs12) 2397 { 2398 u32 exec_control; 2399 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2400 2401 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2402 prepare_vmcs02_early_rare(vmx, vmcs12); 2403 2404 /* 2405 * PIN CONTROLS 2406 */ 2407 exec_control = __pin_controls_get(vmcs01); 2408 exec_control |= (vmcs12->pin_based_vm_exec_control & 2409 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2410 2411 /* Posted interrupts setting is only taken from vmcs12. 
*/ 2412 vmx->nested.pi_pending = false; 2413 if (nested_cpu_has_posted_intr(vmcs12)) { 2414 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2415 } else { 2416 vmx->nested.posted_intr_nv = -1; 2417 exec_control &= ~PIN_BASED_POSTED_INTR; 2418 } 2419 pin_controls_set(vmx, exec_control); 2420 2421 /* 2422 * EXEC CONTROLS 2423 */ 2424 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2425 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2426 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2427 exec_control &= ~CPU_BASED_TPR_SHADOW; 2428 exec_control |= vmcs12->cpu_based_vm_exec_control; 2429 2430 if (exec_control & CPU_BASED_TPR_SHADOW) 2431 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2432 #ifdef CONFIG_X86_64 2433 else 2434 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2435 CPU_BASED_CR8_STORE_EXITING; 2436 #endif 2437 2438 /* 2439 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2440 * for I/O port accesses. 2441 */ 2442 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2443 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2444 2445 /* 2446 * This bit will be computed in nested_get_vmcs12_pages, because 2447 * we do not have access to L1's MSR bitmap yet. For now, keep 2448 * the same bit as before, hoping to avoid multiple VMWRITEs that 2449 * only set/clear this bit. 
2450 */ 2451 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2452 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2453 2454 exec_controls_set(vmx, exec_control); 2455 2456 /* 2457 * SECONDARY EXEC CONTROLS 2458 */ 2459 if (cpu_has_secondary_exec_ctrls()) { 2460 exec_control = __secondary_exec_controls_get(vmcs01); 2461 2462 /* Take the following fields only from vmcs12 */ 2463 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2464 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2465 SECONDARY_EXEC_ENABLE_INVPCID | 2466 SECONDARY_EXEC_ENABLE_RDTSCP | 2467 SECONDARY_EXEC_ENABLE_XSAVES | 2468 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2469 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2470 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2471 SECONDARY_EXEC_ENABLE_VMFUNC | 2472 SECONDARY_EXEC_MODE_BASED_EPT_EXEC | 2473 SECONDARY_EXEC_DESC); 2474 2475 if (nested_cpu_has(vmcs12, 2476 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2477 exec_control |= vmcs12->secondary_vm_exec_control; 2478 2479 /* PML is emulated and never enabled in hardware for L2. */ 2480 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2481 2482 /* VMCS shadowing for L2 is emulated for now */ 2483 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2484 2485 /* 2486 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2487 * will not have to rewrite the controls just for this bit. 
2488 */ 2489 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2490 exec_control |= SECONDARY_EXEC_DESC; 2491 2492 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2493 vmcs_write16(GUEST_INTR_STATUS, 2494 vmcs12->guest_intr_status); 2495 2496 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2497 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2498 2499 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2500 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2501 2502 secondary_exec_controls_set(vmx, exec_control); 2503 } 2504 2505 /* 2506 * ENTRY CONTROLS 2507 * 2508 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2509 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2510 * on the related bits (if supported by the CPU) in the hope that 2511 * we can avoid VMWrites during vmx_set_efer(). 2512 * 2513 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2514 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2515 * do the same for L2. 2516 */ 2517 exec_control = __vm_entry_controls_get(vmcs01); 2518 exec_control |= (vmcs12->vm_entry_controls & 2519 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2520 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2521 if (cpu_has_load_ia32_efer()) { 2522 if (guest_efer & EFER_LMA) 2523 exec_control |= VM_ENTRY_IA32E_MODE; 2524 if (guest_efer != kvm_host.efer) 2525 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2526 } 2527 vm_entry_controls_set(vmx, exec_control); 2528 2529 /* 2530 * EXIT CONTROLS 2531 * 2532 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2533 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2534 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2535 */ 2536 exec_control = __vm_exit_controls_get(vmcs01); 2537 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2538 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2539 else 2540 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2541 vm_exit_controls_set(vmx, exec_control); 2542 2543 /* 2544 * Interrupt/Exception Fields 2545 */ 2546 if (vmx->vcpu.arch.nested_run_pending) { 2547 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2548 vmcs12->vm_entry_intr_info_field); 2549 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2550 vmcs12->vm_entry_exception_error_code); 2551 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2552 vmcs12->vm_entry_instruction_len); 2553 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2554 vmcs12->guest_interruptibility_info); 2555 vmx->loaded_vmcs->nmi_known_unmasked = 2556 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2557 } else { 2558 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2559 } 2560 } 2561 2562 static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, 2563 u64 *ssp, u64 *ssp_tbl) 2564 { 2565 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2566 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2567 *s_cet = vmcs_readl(GUEST_S_CET); 2568 2569 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2570 *ssp = vmcs_readl(GUEST_SSP); 2571 *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); 2572 } 2573 } 2574 2575 static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, 2576 u64 ssp, u64 ssp_tbl) 2577 { 2578 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2579 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2580 vmcs_writel(GUEST_S_CET, s_cet); 2581 2582 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2583 vmcs_writel(GUEST_SSP, ssp); 2584 vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); 2585 } 2586 } 2587 2588 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2589 { 2590 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2591 2592 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2593 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2594 2595 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2596 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2597 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2598 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2599 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2600 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2601 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2602 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2603 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2604 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2605 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2606 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2607 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2608 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2609 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2610 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2611 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2612 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2613 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2614 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2615 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2616 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2617 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2618 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2619 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2620 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2621 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2622 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2623 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2624 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2625 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2626 
vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2627 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2628 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2629 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2630 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2631 2632 vmx_segment_cache_clear(vmx); 2633 } 2634 2635 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2636 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2637 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2638 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2639 vmcs12->guest_pending_dbg_exceptions); 2640 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2641 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2642 2643 /* 2644 * L1 may access the L2's PDPTR, so save them to construct 2645 * vmcs12 2646 */ 2647 if (enable_ept) { 2648 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2649 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2650 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2651 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2652 } 2653 2654 if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending && 2655 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2656 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2657 } 2658 2659 if (nested_cpu_has_xsaves(vmcs12)) 2660 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2661 2662 /* 2663 * Whether page-faults are trapped is determined by a combination of 2664 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2665 * doesn't care about page faults then we should set all of these to 2666 * L1's desires. However, if L0 does care about (some) page faults, it 2667 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2668 * simply ask to exit on each and every L2 page fault. This is done by 2669 * setting MASK=MATCH=0 and (see below) EB.PF=1. 
2670 * Note that below we don't need special code to set EB.PF beyond the 2671 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2672 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2673 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2674 */ 2675 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2676 /* 2677 * TODO: if both L0 and L1 need the same MASK and MATCH, 2678 * go ahead and use it? 2679 */ 2680 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2681 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2682 } else { 2683 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2684 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2685 } 2686 2687 if (cpu_has_vmx_apicv()) { 2688 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2689 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2690 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2691 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2692 } 2693 2694 /* 2695 * If vmcs12 is configured to save TSC on exit via the auto-store list, 2696 * append the MSR to vmcs02's auto-store list so that KVM effectively 2697 * reads TSC at the time of VM-Exit from L2. The saved value will be 2698 * propagated to vmcs12's list on nested VM-Exit. 2699 * 2700 * Don't increment the number of MSRs in the vCPU structure, as saving 2701 * TSC is specific to this particular incarnation of vmcb02, i.e. must 2702 * not bleed into vmcs01. 
2703 */ 2704 if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) && 2705 !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) { 2706 vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr; 2707 vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC; 2708 2709 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1); 2710 } else { 2711 vmx->nested.tsc_autostore_slot = -1; 2712 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 2713 } 2714 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2715 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2716 2717 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) 2718 vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, 2719 vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); 2720 2721 set_cr4_guest_host_mask(vmx); 2722 } 2723 2724 /* 2725 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2726 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2727 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2728 * guest in a way that will both be appropriate to L1's requests, and our 2729 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2730 * function also has additional necessary side-effects, like setting various 2731 * vcpu->arch fields. 2732 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2733 * is assigned to entry_failure_code on failure. 
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	/*
	 * Reload the rarely-changing guest state only when vmcs12 is dirty or
	 * an enlightened VMCS is in use.  The PDPTRs are reloaded later unless
	 * the enlightened VMCS marks GUEST_GRP1 as clean.
	 */
	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	/*
	 * Take DR7 and DEBUGCTL from vmcs12 only for a pending nested run that
	 * requests loading debug controls; otherwise use the vCPU's DR7 and
	 * the DEBUGCTL value saved before VM-Enter.
	 */
	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	/* Likewise, fall back to the pre-VM-Enter CET state and BNDCFGS. */
	if (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/*
	 * EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	/* Combine L1's TSC offset and scaling ratio with the L2 values. */
	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4.  The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be later fixed by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	/*
	 * Load PERF_GLOBAL_CTRL through MSR-write emulation when vmcs12 asks
	 * for it and the vPMU has the MSR; a failing write is unexpected
	 * (hence the WARN) and fails the VM-Enter.
	 */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

/*
 * Consistency checks on vmcs12's NMI controls.  Returns -EINVAL if virtual
 * NMIs are enabled without NMI exiting, or if NMI-window exiting is enabled
 * without virtual NMIs; returns 0 otherwise.
 */
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
} 2913 2914 /* Page-walk levels validity. */ 2915 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2916 case VMX_EPTP_PWL_5: 2917 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2918 return false; 2919 break; 2920 case VMX_EPTP_PWL_4: 2921 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2922 return false; 2923 break; 2924 default: 2925 return false; 2926 } 2927 2928 /* Reserved bits should not be set: the EPTP must be a legal GPA and bits 11:7 must be zero */ 2929 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2930 return false; 2931 2932 /* AD, if set, should be supported */ 2933 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2934 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2935 return false; 2936 } 2937 2938 return true; 2939 } 2940 2941 /* 2942 * Checks related to VM-Execution Control Fields 2943 */ 2944 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2945 struct vmcs12 *vmcs12) 2946 { 2947 struct vcpu_vmx *vmx = to_vmx(vcpu); 2948 /* Pin-based and primary processor-based controls are always verified against the low/high values in vmx->nested.msrs; secondary controls only when they are activated. */ 2949 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2950 vmx->nested.msrs.pinbased_ctls_low, 2951 vmx->nested.msrs.pinbased_ctls_high)) || 2952 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2953 vmx->nested.msrs.procbased_ctls_low, 2954 vmx->nested.msrs.procbased_ctls_high))) 2955 return -EINVAL; 2956 2957 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2958 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2959 vmx->nested.msrs.secondary_ctls_low, 2960 vmx->nested.msrs.secondary_ctls_high))) 2961 return -EINVAL; 2962 2963 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2964 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2965 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2966 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2967 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2968 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2969 nested_vmx_check_nmi_controls(vmcs12) || 2970
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2971 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2972 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2973 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2974 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2975 return -EINVAL; 2976 /* "Save preemption timer" is rejected unless the preemption timer itself is enabled. NOTE(review): unlike the neighbouring checks this one is not wrapped in CC() - confirm whether that is intentional. */ 2977 if (!nested_cpu_has_preemption_timer(vmcs12) && 2978 nested_cpu_has_save_preemption_timer(vmcs12)) 2979 return -EINVAL; 2980 2981 if (nested_cpu_has_ept(vmcs12) && 2982 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2983 return -EINVAL; 2984 2985 if (nested_cpu_has_vmfunc(vmcs12)) { 2986 if (CC(vmcs12->vm_function_control & 2987 ~vmx->nested.msrs.vmfunc_controls)) 2988 return -EINVAL; 2989 2990 if (nested_cpu_has_eptp_switching(vmcs12)) { 2991 if (CC(!nested_cpu_has_ept(vmcs12)) || 2992 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2993 return -EINVAL; 2994 } 2995 } 2996 2997 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2998 CC(!vmcs12->tsc_multiplier)) 2999 return -EINVAL; 3000 3001 return 0; 3002 } 3003 3004 /* 3005 * Checks related to VM-Exit Control Fields 3006 */ 3007 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 3008 struct vmcs12 *vmcs12) 3009 { 3010 struct vcpu_vmx *vmx = to_vmx(vcpu); 3011 3012 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 3013 vmx->nested.msrs.exit_ctls_low, 3014 vmx->nested.msrs.exit_ctls_high)) || 3015 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 3016 return -EINVAL; 3017 3018 return 0; 3019 } 3020 3021 /* 3022 * Checks related to VM-Entry Control Fields 3023 */ 3024 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 3025 struct vmcs12 *vmcs12) 3026 { 3027 struct vcpu_vmx *vmx = to_vmx(vcpu); 3028 3029 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 3030 vmx->nested.msrs.entry_ctls_low, 3031 vmx->nested.msrs.entry_ctls_high))) 3032 return -EINVAL; 3033 3034 /* 3035 * From the Intel SDM, volume 3: 3036 *
Fields relevant to VM-entry event injection must be set properly. 3037 * These fields are the VM-entry interruption-information field, the 3038 * VM-entry exception error code, and the VM-entry instruction length. 3039 */ 3040 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3041 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3042 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3043 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3044 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3045 bool urg = nested_cpu_has2(vmcs12, 3046 SECONDARY_EXEC_UNRESTRICTED_GUEST); /* Without unrestricted guest, L2 is treated as being in protected mode regardless of CR0.PE. */ 3047 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3048 3049 /* VM-entry interruption-info field: interruption type */ 3050 if (CC(intr_type == INTR_TYPE_RESERVED) || 3051 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3052 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3053 return -EINVAL; 3054 3055 /* VM-entry interruption-info field: vector */ 3056 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3057 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3058 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3059 return -EINVAL; 3060 3061 /* 3062 * Cannot deliver error code in real mode or if the interrupt 3063 * type is not hardware exception. For other cases, do the 3064 * consistency check only if the vCPU doesn't enumerate 3065 * VMX_BASIC_NO_HW_ERROR_CODE_CC.
3066 */ 3067 if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { 3068 if (CC(has_error_code)) 3069 return -EINVAL; 3070 } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { 3071 if (CC(has_error_code != x86_exception_has_error_code(vector))) 3072 return -EINVAL; 3073 } 3074 3075 /* VM-entry exception error code */ 3076 if (CC(has_error_code && 3077 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 3078 return -EINVAL; 3079 3080 /* VM-entry interruption-info field: reserved bits */ 3081 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 3082 return -EINVAL; 3083 3084 /* VM-entry instruction length; only the software-generated event types below are checked, other types fall through the switch */ 3085 switch (intr_type) { 3086 case INTR_TYPE_SOFT_EXCEPTION: 3087 case INTR_TYPE_SOFT_INTR: 3088 case INTR_TYPE_PRIV_SW_EXCEPTION: 3089 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 3090 CC(vmcs12->vm_entry_instruction_len == 0 && 3091 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 3092 return -EINVAL; 3093 } 3094 } 3095 3096 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 3097 return -EINVAL; 3098 3099 return 0; 3100 } 3101 /* Run the VM-execution, VM-exit and VM-entry control checks; with CONFIG_KVM_HYPERV and an eVMCS-capable vCPU, the result of nested_evmcs_check_controls() is returned afterwards. Returns 0 on success, -EINVAL if a local check fails. */ 3102 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 3103 struct vmcs12 *vmcs12) 3104 { 3105 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 3106 nested_check_vm_exit_controls(vcpu, vmcs12) || 3107 nested_check_vm_entry_controls(vcpu, vmcs12)) 3108 return -EINVAL; 3109 3110 #ifdef CONFIG_KVM_HYPERV 3111 if (guest_cpu_cap_has_evmcs(vcpu)) 3112 return nested_evmcs_check_controls(vmcs12); 3113 #endif 3114 3115 return 0; 3116 } 3117 /* Late check of vmcs12's TPR threshold against the vTPR read from the mapped virtual-APIC page. Only performed when the warn_on_missed_cc module parameter is set; returns 0 or -EINVAL. */ 3118 static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, 3119 struct vmcs12 *vmcs12) 3120 { 3121 void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; 3122 u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; 3123 3124 /* 3125 * Don't bother with the consistency checks if KVM isn't configured to 3126 * WARN on missed consistency checks, as KVM needs to rely on hardware 3127 * to fully detect an illegal vTPR vs.
TPR Threshold combination due to 3128 * the vTPR being writable by L1 at all times (it's an in-memory value, 3129 * not a VMCS field). I.e. even if the check passes now, it might fail 3130 * at the actual VM-Enter. 3131 * 3132 * Keying off the module param also allows treating an invalid vAPIC 3133 * mapping as a consistency check failure without increasing the risk 3134 * of breaking a "real" VM. 3135 */ 3136 if (!warn_on_missed_cc) 3137 return 0; 3138 3139 if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && 3140 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && 3141 !nested_cpu_has_vid(vmcs12) && 3142 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 3143 (CC(!vapic) || 3144 CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) 3145 return -EINVAL; 3146 3147 return 0; 3148 } 3149 /* On CONFIG_X86_64 builds, require vmcs12's "host address-space size" VM-exit control to match the EFER.LMA bit in vcpu->arch.efer; other builds always return 0. */ 3150 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3151 struct vmcs12 *vmcs12) 3152 { 3153 #ifdef CONFIG_X86_64 3154 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3155 !!(vcpu->arch.efer & EFER_LMA))) 3156 return -EINVAL; 3157 #endif 3158 return 0; 3159 } 3160 3161 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3162 { 3163 /* 3164 * Check that the given linear address is canonical after a VM exit 3165 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 3166 */ 3167 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ?
57 : 48; 3168 3169 return !__is_canonical_address(la, l1_address_bits_on_exit); 3170 } 3171 /* CET checks shared by the host-state and guest-state paths: S_CET must pass kvm_is_valid_u_s_cet(), SSP must be 4-byte aligned and the SSP table address must be canonical. Returns 0 or -EINVAL. */ 3172 static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3173 u64 ssp, u64 ssp_tbl) 3174 { 3175 if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3176 CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3177 return -EINVAL; 3178 3179 return 0; 3180 } 3181 /* Consistency checks on the host-state fields of vmcs12. Returns 0 if all checks pass, -EINVAL otherwise. */ 3182 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3183 struct vmcs12 *vmcs12) 3184 { 3185 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3186 3187 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3188 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3189 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3190 return -EINVAL; 3191 3192 if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3193 return -EINVAL; 3194 3195 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3196 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3197 return -EINVAL; 3198 3199 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3200 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3201 return -EINVAL; 3202 3203 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3204 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3205 vmcs12->host_ia32_perf_global_ctrl))) 3206 return -EINVAL; 3207 /* With the host address-space size control set, host CR4.PAE is required; without it, the IA-32e mode guest entry control, host CR4.PCIDE and host RIP bits 63:32 must all be clear. */ 3208 if (ia32e) { 3209 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3210 return -EINVAL; 3211 } else { 3212 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3213 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3214 CC((vmcs12->host_rip) >> 32)) 3215 return -EINVAL; 3216 } 3217 3218 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3219 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3220 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3221 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3222 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3223 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3224 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3225 CC(vmcs12->host_cs_selector == 0) || 3226 CC(vmcs12->host_tr_selector == 0) || 3227 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3228 return -EINVAL; 3229 3230 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3231 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3232 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3233 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3234 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3235 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3236 return -EINVAL; 3237 3238 /* 3239 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3240 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3241 * the values of the LMA and LME bits in the field must each be that of 3242 * the host address-space size VM-exit control. 3243 */ 3244 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3245 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3246 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3247 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3248 return -EINVAL; 3249 } 3250 3251 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3252 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3253 vmcs12->host_ssp, 3254 vmcs12->host_ssp_tbl)) 3255 return -EINVAL; 3256 3257 /* 3258 * IA32_S_CET and SSP must be canonical if the host will 3259 * enter 64-bit mode after VM-exit; otherwise, higher 3260 * 32-bits must be all 0s.
3261 */ 3262 if (ia32e) { 3263 if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || 3264 CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) 3265 return -EINVAL; 3266 } else { 3267 if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) 3268 return -EINVAL; 3269 } 3270 } 3271 3272 return 0; 3273 } 3274 /* Validate vmcs12's VMCS link pointer. INVALID_GPA is accepted without further checks; otherwise the address must pass page_address_valid() and the header read from it must carry VMCS12_REVISION and a shadow_vmcs flag equal to nested_cpu_has_shadow_vmcs(vmcs12). Returns 0 or -EINVAL. */ 3275 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3276 struct vmcs12 *vmcs12) 3277 { 3278 struct vcpu_vmx *vmx = to_vmx(vcpu); 3279 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3280 struct vmcs_hdr hdr; 3281 3282 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3283 return 0; 3284 3285 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3286 return -EINVAL; 3287 /* (Re)initialize the cache only when it does not already describe this GPA. */ 3288 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3289 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3290 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3291 return -EINVAL; 3292 3293 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3294 offsetof(struct vmcs12, hdr), 3295 sizeof(hdr)))) 3296 return -EINVAL; 3297 3298 if (CC(hdr.revision_id != VMCS12_REVISION) || 3299 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3300 return -EINVAL; 3301 3302 return 0; 3303 } 3304 3305 /* 3306 * Checks related to Guest Non-register State 3307 */ 3308 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3309 { 3310 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3311 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3312 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3313 return -EINVAL; 3314 3315 return 0; 3316 } 3317 /* Consistency checks on the guest-state fields of vmcs12. On failure returns -EINVAL with *entry_failure_code set to ENTRY_FAIL_VMCS_LINK_PTR if the link-pointer check failed, ENTRY_FAIL_DEFAULT otherwise. */ 3318 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3319 struct vmcs12 *vmcs12, 3320 enum vm_entry_failure_code *entry_failure_code) 3321 { 3322 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3323 3324 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3325 3326 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3327
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3328 return -EINVAL; 3329 3330 if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) 3331 return -EINVAL; 3332 3333 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3334 u64 debugctl = vmcs12->guest_ia32_debugctl; 3335 3336 /* 3337 * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in 3338 * vmcs12's DEBUGCTL under a quirk for backwards compatibility. 3339 * Note that the quirk only relaxes the consistency check. The 3340 * vmcs02 bit is still under the control of the host. In 3341 * particular, if a host administrator decides to clear the bit, 3342 * then L1 has no say in the matter. 3343 */ 3344 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM)) 3345 debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM; 3346 3347 if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3348 CC(!vmx_is_valid_debugctl(vcpu, debugctl, false))) 3349 return -EINVAL; 3350 } 3351 3352 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3353 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3354 return -EINVAL; 3355 3356 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3357 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3358 return -EINVAL; 3359 } 3360 3361 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3362 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3363 vmcs12->guest_ia32_perf_global_ctrl))) 3364 return -EINVAL; 3365 3366 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3367 return -EINVAL; 3368 3369 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3370 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3371 return -EINVAL; 3372 3373 /* 3374 * If the load IA32_EFER VM-entry control is 1, the following checks 3375 * are performed on the field for the IA32_EFER MSR: 3376 * - Bits reserved in the IA32_EFER MSR must be 0.
3377 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3378 * the IA-32e mode guest VM-entry control. It must also be identical 3379 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3380 * CR0.PG) is 1. 3381 */ 3382 if (vcpu->arch.nested_run_pending && 3383 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3384 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3385 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3386 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3387 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3388 return -EINVAL; 3389 } 3390 3391 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3392 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3393 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3394 return -EINVAL; 3395 3396 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { 3397 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, 3398 vmcs12->guest_ssp, 3399 vmcs12->guest_ssp_tbl)) 3400 return -EINVAL; 3401 3402 /* 3403 * Guest SSP must have 63:N bits identical, rather than 3404 * be canonical (i.e., 63:N-1 bits identical), where N is 3405 * the CPU's maximum linear-address width. Similar to 3406 * is_noncanonical_msr_address(), use the host's 3407 * linear-address width. 3408 */ 3409 if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) 3410 return -EINVAL; 3411 } 3412 3413 if (nested_check_guest_non_reg_state(vmcs12)) 3414 return -EINVAL; 3415 3416 return 0; 3417 } 3418 /* nested_get_evmcs_page(): if the vCPU has eVMCS capability and the eVMCS mapping is pending (EVMPTR_MAP_PENDING), run the enlightened VMPTRLD handler. Returns false if that handler reports EVMPTRLD_VMFAIL or EVMPTRLD_ERROR, true otherwise. */ 3419 #ifdef CONFIG_KVM_HYPERV 3420 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3421 { 3422 struct vcpu_vmx *vmx = to_vmx(vcpu); 3423 3424 /* 3425 * hv_evmcs may end up being not mapped after migration (when 3426 * L2 was running), map it here to make sure vmcs12 changes are 3427 * properly reflected.
3428 */ 3429 if (guest_cpu_cap_has_evmcs(vcpu) && 3430 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3431 enum nested_evmptrld_status evmptrld_status = 3432 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3433 3434 if (evmptrld_status == EVMPTRLD_VMFAIL || 3435 evmptrld_status == EVMPTRLD_ERROR) 3436 return false; 3437 3438 /* 3439 * Post migration VMCS12 always provides the most actual 3440 * information, copy it to eVMCS upon entry. 3441 */ 3442 vmx->nested.need_vmcs12_to_shadow_sync = true; 3443 } 3444 3445 return true; 3446 } 3447 #endif 3448 /* Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC page, posted-interrupt descriptor), write their host physical addresses with vmcs_write64(), and set or clear CPU_BASED_USE_MSR_BITMAPS according to nested_vmx_prepare_msr_bitmap(). Returns false if the PDPTRs cannot be reloaded or the APIC-access page cannot be mapped, true otherwise. */ 3449 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3450 { 3451 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3452 struct vcpu_vmx *vmx = to_vmx(vcpu); 3453 struct kvm_host_map *map; 3454 3455 if (!vcpu->arch.pdptrs_from_userspace && 3456 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3457 /* 3458 * Reload the guest's PDPTRs since after a migration 3459 * the guest CR3 might be restored prior to setting the nested 3460 * state which can lead to a load of wrong PDPTRs.
3461 */ 3462 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3463 return false; 3464 } 3465 3466 3467 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3468 map = &vmx->nested.apic_access_page_map; 3469 3470 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3471 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3472 } else { 3473 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3474 __func__); 3475 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3476 vcpu->run->internal.suberror = 3477 KVM_INTERNAL_ERROR_EMULATION; 3478 vcpu->run->internal.ndata = 0; 3479 return false; 3480 } 3481 } 3482 3483 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3484 map = &vmx->nested.virtual_apic_map; 3485 3486 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3487 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3488 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3489 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3490 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3491 /* 3492 * The processor will never use the TPR shadow, simply 3493 * clear the bit from the execution control. Such a 3494 * configuration is useless, but it happens in tests. 3495 * For any other configuration, failing the vm entry is 3496 * _not_ what the processor does but it's basically the 3497 * only possibility we have. 3498 */ 3499 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3500 } else { 3501 /* 3502 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3503 * force VM-Entry to fail.
3504 */ 3505 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3506 } 3507 } 3508 3509 if (nested_cpu_has_posted_intr(vmcs12)) { 3510 map = &vmx->nested.pi_desc_map; 3511 3512 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3513 vmx->nested.pi_desc = 3514 (struct pi_desc *)(((void *)map->hva) + 3515 offset_in_page(vmcs12->posted_intr_desc_addr)); 3516 vmcs_write64(POSTED_INTR_DESC_ADDR, 3517 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3518 } else { 3519 /* 3520 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3521 * access the contents of the VMCS12 posted interrupt 3522 * descriptor. (Note that KVM may do this when it 3523 * should not, per the architectural specification.) 3524 */ 3525 vmx->nested.pi_desc = NULL; 3526 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3527 } 3528 } 3529 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3530 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3531 else 3532 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3533 3534 return true; 3535 } 3536 /* Map the eVMCS page (CONFIG_KVM_HYPERV only) and, when the vCPU is in guest mode, the vmcs12 pages. Returns false on failure; an eVMCS failure also fills vcpu->run with a KVM_EXIT_INTERNAL_ERROR exit. */ 3537 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3538 { 3539 #ifdef CONFIG_KVM_HYPERV 3540 /* 3541 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3542 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3543 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3544 * migration.
3545 */ 3546 if (!nested_get_evmcs_page(vcpu)) { 3547 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3548 __func__); 3549 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3550 vcpu->run->internal.suberror = 3551 KVM_INTERNAL_ERROR_EMULATION; 3552 vcpu->run->internal.ndata = 0; 3553 3554 return false; 3555 } 3556 #endif 3557 3558 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3559 return false; 3560 3561 return true; 3562 } 3563 /* Log a GPA into the PML buffer described by vmcs12. Returns 1 when the buffer is full (nested.pml_full is or becomes set), 0 otherwise, including when the vCPU is not in guest mode, PML is not enabled in vmcs12, or the guest write fails. */ 3564 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3565 { 3566 struct vmcs12 *vmcs12; 3567 struct vcpu_vmx *vmx = to_vmx(vcpu); 3568 gpa_t dst; 3569 3570 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3571 return 0; 3572 3573 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3574 return 1; 3575 3576 /* 3577 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3578 * set is already checked as part of A/D emulation. 3579 */ 3580 vmcs12 = get_vmcs12(vcpu); 3581 if (!nested_cpu_has_pml(vmcs12)) 3582 return 0; 3583 3584 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3585 vmx->nested.pml_full = true; 3586 return 1; 3587 } 3588 /* Log the 4 KiB-aligned address at slot guest_pml_index of the buffer. */ 3589 gpa &= ~0xFFFull; 3590 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3591 3592 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3593 offset_in_page(dst), sizeof(gpa))) 3594 return 0; 3595 /* The index counts down after each successfully logged entry. */ 3596 vmcs12->guest_pml_index--; 3597 3598 return 0; 3599 } 3600 3601 /* 3602 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3603 * for running VMX instructions (except VMXON, whose prerequisites are 3604 * slightly different). It also specifies what exception to inject otherwise. 3605 * Note that many of these exceptions have priority over VM exits, so they 3606 * don't have to be checked again here.
3607 */ /* Returns 1 if the instruction may proceed, 0 if #UD (VMXON not executed) or #GP(0) (non-zero CPL) was queued instead. */ 3608 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3609 { 3610 if (!to_vmx(vcpu)->nested.vmxon) { 3611 kvm_queue_exception(vcpu, UD_VECTOR); 3612 return 0; 3613 } 3614 3615 if (vmx_get_cpl(vcpu)) { 3616 kvm_inject_gp(vcpu, 0); 3617 return 0; 3618 } 3619 3620 return 1; 3621 } 3622 3623 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3624 struct vmcs12 *vmcs12); 3625 3626 /* 3627 * If from_vmentry is false, this is being called from state restore (either RSM 3628 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3629 * 3630 * Returns: 3631 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3632 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3633 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3634 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3635 */ 3636 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3637 bool from_vmentry) 3638 { 3639 struct vcpu_vmx *vmx = to_vmx(vcpu); 3640 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3641 enum vm_entry_failure_code entry_failure_code; 3642 union vmx_exit_reason exit_reason = { 3643 .basic = EXIT_REASON_INVALID_STATE, 3644 .failed_vmentry = 1, 3645 }; 3646 u32 failed_index; 3647 3648 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3649 vmx->nested.current_vmptr, 3650 vmcs12->guest_rip, 3651 vmcs12->guest_intr_status, 3652 vmcs12->vm_entry_intr_info_field, 3653 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3654 vmcs12->ept_pointer, 3655 vmcs12->guest_cr3, 3656 KVM_ISA_VMX); 3657 3658 kvm_service_local_tlb_flush_requests(vcpu); 3659 /* Snapshot DEBUGCTL, BNDCFGS and CET state whenever vmcs12 will not supply them, mirroring the conditions under which prepare_vmcs02() writes the pre_vmenter_* values back. */ 3660 if (!vcpu->arch.nested_run_pending || 3661 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3662 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3663 if (kvm_mpx_supported() && 3664 (!vcpu->arch.nested_run_pending || 3665 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3666 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3667 3668 if
(!vcpu->arch.nested_run_pending || 3669 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3670 vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3671 &vmx->nested.pre_vmenter_ssp, 3672 &vmx->nested.pre_vmenter_ssp_tbl); 3673 3674 /* 3675 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the 3676 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but 3677 * not KVM, KVM must unwind its software model to the pre-VM-Entry host 3678 * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not 3679 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to 3680 * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the 3681 * unwind naturally setting arch.cr3 to the correct value. Smashing 3682 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, 3683 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be 3684 * overwritten with a shadow CR3 prior to re-entering L1. 3685 */ 3686 if (!enable_ept) 3687 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3688 3689 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3690 3691 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3692 /* Page mapping, the late control check and the guest-state checks are only done here for a real VMLAUNCH/VMRESUME; each failure path switches back to vmcs01. */ 3693 if (from_vmentry) { 3694 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3695 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3696 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3697 } 3698 3699 if (nested_vmx_check_controls_late(vcpu, vmcs12)) { 3700 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3701 return NVMX_VMENTRY_VMFAIL; 3702 } 3703 3704 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3705 &entry_failure_code)) { 3706 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3707 vmcs12->exit_qualification = entry_failure_code; 3708 goto vmentry_fail_vmexit; 3709 } 3710 } 3711 3712 enter_guest_mode(vcpu); 3713 3714 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3715 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3716 vmcs12->exit_qualification = entry_failure_code; 3717 goto vmentry_fail_vmexit_guest_mode; 3718 } 3719 3720 if
(from_vmentry) { 3721 failed_index = nested_vmx_load_msr(vcpu, 3722 vmcs12->vm_entry_msr_load_addr, 3723 vmcs12->vm_entry_msr_load_count); 3724 if (failed_index) { 3725 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3726 vmcs12->exit_qualification = failed_index; 3727 goto vmentry_fail_vmexit_guest_mode; 3728 } 3729 } else { 3730 /* 3731 * The MMU is not initialized to point at the right entities yet and 3732 * "get pages" would need to read data from the guest (i.e. we will 3733 * need to perform gpa to hpa translation). Request a call 3734 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3735 * have already been set at vmentry time and should not be reset. 3736 */ 3737 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3738 } 3739 3740 /* 3741 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3742 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3743 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3744 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3745 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3746 */ 3747 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3748 CPU_BASED_NMI_WINDOW_EXITING)) || 3749 kvm_apic_has_pending_init_or_sipi(vcpu) || 3750 kvm_apic_has_interrupt(vcpu)) 3751 kvm_make_request(KVM_REQ_EVENT, vcpu); 3752 3753 /* 3754 * Do not start the preemption timer hrtimer until after we know 3755 * we are successful, so that only nested_vmx_vmexit needs to cancel 3756 * the timer. 3757 */ 3758 vmx->nested.preemption_timer_expired = false; 3759 if (nested_cpu_has_preemption_timer(vmcs12)) { 3760 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3761 vmx_start_preemption_timer(vcpu, timer_value); 3762 } 3763 3764 /* 3765 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3766 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3767 * returned as far as L1 is concerned.
It will only return (and set 3768 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3769 */ 3770 return NVMX_VMENTRY_SUCCESS; 3771 3772 /* 3773 * A failed consistency check that leads to a VMExit during L1's 3774 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3775 * 26.7 "VM-entry failures during or after loading guest state". 3776 */ 3777 vmentry_fail_vmexit_guest_mode: /* NOTE(review): presumably backs out the vmcs12 TSC offset that prepare_vmcs02() folded into vcpu->arch.tsc_offset - confirm. */ 3778 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3779 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3780 leave_guest_mode(vcpu); 3781 3782 vmentry_fail_vmexit: 3783 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3784 /* On the state-restore path, report the failure without loading vmcs12's host state or recording an exit reason. */ 3785 if (!from_vmentry) 3786 return NVMX_VMENTRY_VMEXIT; 3787 3788 load_vmcs12_host_state(vcpu, vmcs12); 3789 vmcs12->vm_exit_reason = exit_reason.full; 3790 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3791 vmx->nested.need_vmcs12_to_shadow_sync = true; 3792 return NVMX_VMENTRY_VMEXIT; 3793 } 3794 3795 /* 3796 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3797 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/* Sampled up front; consumed by the MOV-SS blocking check below. */
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* An enlightened-VMPTRLD error is reported to the guest as #UD. */
	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_branch_retired(vcpu);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	/* Neither an enlightened VMCS nor a current VMCS pointer: VMfailInvalid. */
	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	/* Refresh the cached vmcs12 from whichever backing store is in use. */
	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and act appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	/*
	 * The launch state must be the opposite of @launch: a non-zero state
	 * fails VMLAUNCH, a zero state fails VMRESUME.
	 */
	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
	vmx->nested.has_preemption_timer_deadline = false;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Hide L1D cache contents from the nested guest. */
	kvm_request_l1tf_flush_l1d();

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that vmcs12 cache was
	 * transferred as part of captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	switch (vmcs12->guest_activity_state) {
	case GUEST_ACTIVITY_HLT:
		/*
		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
		 * awakened by event injection or by an NMI-window VM-exit or
		 * by an interrupt-window VM-exit, halt the vcpu.
		 */
		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
			vcpu->arch.nested_run_pending = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}
		break;
	case GUEST_ACTIVITY_WAIT_SIPI:
		/* Entering in wait-for-SIPI: drop the pending run, park the vCPU. */
		vcpu->arch.nested_run_pending = 0;
		kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
		break;
	default:
		break;
	}

	return 1;

vmentry_failed:
	/*
	 * Translate the entry status: 0 for a KVM internal error, 1 for
	 * NVMX_VMENTRY_VMEXIT, and VMfail with an invalid-control-field
	 * error for the only other expected status, NVMX_VMENTRY_VMFAIL.
	 */
	vcpu->arch.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *     didn't trap the bit, because if L1 did, so would L0).
 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *     been modified by L2, and L1 knows it. So just leave the old value of
 *     the bit from vmcs12.guest_cr0.
 *     Note that the bit from vmcs02 GUEST_CR0
 *     isn't relevant, because if L0 traps this bit it can set it to anything.
 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *     changed these bits, and therefore they need to be updated, but L0
 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}

/*
 * CR4 counterpart of vmcs12_guest_cr0(): the same three-way merge, using
 * GUEST_CR4, CR4_READ_SHADOW, vmcs12->cr4_guest_host_mask and
 * vcpu->arch.cr4_guest_owned_bits.
 */
static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}

/*
 * Fill vmcs12's IDT-vectoring information from the event KVM currently has
 * marked as injected. Exactly one source is used, checked in this order:
 * exception, NMI, interrupt. The field is zeroed when nothing is injected
 * and for triple-fault / double-fault exits (see the comment below).
 */
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12,
				      u32 vm_exit_reason, u32 exit_intr_info)
{
	u32 idt_vectoring;
	unsigned int nr;

	/*
	 * Per the SDM, VM-Exits due to double and triple faults are never
	 * considered to occur during event delivery, even if the double/triple
	 * fault is the result of an escalating vectoring issue.
	 *
	 * Note, the SDM qualifies the double fault behavior with "The original
	 * event results in a double-fault exception". It's unclear why the
	 * qualification exists since exits due to double fault can occur only
	 * while vectoring a different exception (injected events are never
	 * subject to interception), i.e. there's _always_ an original event.
	 *
	 * The SDM also uses NMI as a confusing example for the "original event
	 * causes the VM exit directly" clause. NMI isn't special in any way,
	 * the same rule applies to all events that cause an exit directly.
	 * NMI is an odd choice for the example because NMIs can only occur on
	 * instruction boundaries, i.e. they _can't_ occur during vectoring.
	 */
	if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
	    ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
	     is_double_fault(exit_intr_info))) {
		vmcs12->idt_vectoring_info_field = 0;
	} else if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.vector;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		/* Soft exceptions also report the instruction length. */
		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		/*
		 * NOTE(review): the soft-interrupt path stores the length in
		 * vm_entry_instruction_len, whereas the soft-exception path
		 * above uses vm_exit_instruction_len - confirm this asymmetry
		 * is intentional.
		 */
		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else {
		vmcs12->idt_vectoring_info_field = 0;
	}
}

/*
 * Process a pending nested posted interrupt, if any. When the descriptor's
 * ON bit was set, the PIR is merged into the virtual-APIC page through
 * __kvm_apic_update_irr() and the low byte of GUEST_INTR_STATUS is raised
 * if the new highest vector is greater than its current value.
 *
 * Returns 0 when nothing is pending or processing completed, -ENXIO (after
 * calling kvm_handle_memory_failure()) when the posted-interrupt descriptor
 * or the virtual-APIC page pointer is NULL.
 */
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	void *vapic_page;
	u16 status;

	if (!vmx->nested.pi_pending)
		return 0;

	if (!vmx->nested.pi_desc)
		goto mmio_needed;

	vmx->nested.pi_pending = false;

	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
		return 0;

	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
	if (max_irr > 0) {
		vapic_page = vmx->nested.virtual_apic_map.hva;
		if (!vapic_page)
			goto mmio_needed;

		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
			vapic_page, &max_irr);
		status = vmcs_read16(GUEST_INTR_STATUS);
		/* Only ever raise the low byte, never lower it. */
		if ((u8)max_irr > ((u8)status & 0xff)) {
			status &= ~0xff;
			status |= (u8)max_irr;
			vmcs_write16(GUEST_INTR_STATUS, status);
		}
	}

	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
	return 0;

mmio_needed:
	kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
	return -ENXIO;
}

/*
 * Deliver the exception queued in vcpu->arch.exception_vmexit to L1 as an
 * EXIT_REASON_EXCEPTION_NMI nested VM-Exit. The exit qualification comes
 * from the payload if there is one, else from CR2 for #PF, from a massaged
 * DR6 for #DB, and is 0 otherwise.
 */
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qual;

	if (ex->has_payload) {
		exit_qual = ex->payload;
	} else if (ex->vector == PF_VECTOR) {
		exit_qual = vcpu->arch.cr2;
	} else if (ex->vector == DB_VECTOR) {
		/* Drop DR6.BT and flip the DR6_ACTIVE_LOW bits. */
		exit_qual = vcpu->arch.dr6;
		exit_qual &= ~DR6_BT;
		exit_qual ^= DR6_ACTIVE_LOW;
	} else {
		exit_qual = 0;
	}

	/*
	 * Unlike AMD's Paged Real Mode, which reports an error code on #PF
	 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
	 * "has error code" flags on VM-Exit if the CPU is in Real Mode.
	 */
	if (ex->has_error_code && is_protmode(vcpu)) {
		/*
		 * Intel CPUs do not generate error codes with bits 31:16 set,
		 * and more importantly VMX disallows setting bits 31:16 in the
		 * injected error code for VM-Entry. Drop the bits to mimic
		 * hardware and avoid inducing failure on nested VM-Entry if L1
		 * chooses to inject the exception back to L2. AMD CPUs _do_
		 * generate "full" 32-bit error codes, so KVM allows userspace
		 * to inject exception error codes with bits 31:16 set.
		 */
		vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (kvm_exception_is_soft(ex->vector))
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	/*
	 * Set the unblock-NMI flag only when NMIs are masked and vmcs12
	 * holds no valid IDT-vectoring info.
	 */
	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
	    vmx_get_nmi_mask(vcpu))
		intr_info |= INTR_INFO_UNBLOCK_NMI;

	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}

/*
 * Returns true if a debug trap is (likely) pending delivery. Infer the class
 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
 * Using the payload is flawed because code breakpoints (fault-like) and data
 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
 * this will return false positives if a to-be-injected code breakpoint #DB is
 * pending (from KVM's perspective, but not "pending" across an instruction
 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
 * too is trap-like.
4138 * 4139 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4140 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4141 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4142 * from the emulator (because such #DBs are fault-like and thus don't trigger 4143 * actions that fire on instruction retire). 4144 */ 4145 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4146 { 4147 if (!ex->pending || ex->vector != DB_VECTOR) 4148 return 0; 4149 4150 /* General Detect #DBs are always fault-like. */ 4151 return ex->payload & ~DR6_BD; 4152 } 4153 4154 /* 4155 * Returns true if there's a pending #DB exception that is lower priority than 4156 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4157 * KVM, but could theoretically be injected by userspace. Note, this code is 4158 * imperfect, see above. 4159 */ 4160 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4161 { 4162 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4163 } 4164 4165 /* 4166 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4167 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4168 * represents these debug traps with a payload that is said to be compatible 4169 * with the 'pending debug exceptions' field, write the payload to the VMCS 4170 * field if a VM-exit is delivered before the debug trap. 
4171 */ 4172 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4173 { 4174 unsigned long pending_dbg; 4175 4176 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4177 if (pending_dbg) 4178 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4179 } 4180 4181 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4182 { 4183 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4184 to_vmx(vcpu)->nested.preemption_timer_expired; 4185 } 4186 4187 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4188 { 4189 struct vcpu_vmx *vmx = to_vmx(vcpu); 4190 void *vapic = vmx->nested.virtual_apic_map.hva; 4191 int max_irr, vppr; 4192 4193 if (nested_vmx_preemption_timer_pending(vcpu) || 4194 vmx->nested.mtf_pending) 4195 return true; 4196 4197 /* 4198 * Virtual Interrupt Delivery doesn't require manual injection. Either 4199 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4200 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4201 * the interrupt from the PIR to RVI prior to entering the guest. 4202 */ 4203 if (for_injection) 4204 return false; 4205 4206 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4207 __vmx_interrupt_blocked(vcpu)) 4208 return false; 4209 4210 if (!vapic) 4211 return false; 4212 4213 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4214 4215 max_irr = vmx_get_rvi(); 4216 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4217 return true; 4218 4219 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4220 pi_test_on(vmx->nested.pi_desc)) { 4221 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4222 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4223 return true; 4224 } 4225 4226 return false; 4227 } 4228 4229 /* 4230 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4231 * edits to fill in missing examples, e.g. 
 * #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1 Hardware Reset and Machine Checks
 *   - RESET
 *   - Machine Check
 *
 * 2 Trap on Task Switch
 *   - T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *   - FLUSH
 *   - STOPCLK
 *   - SMI
 *   - INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *   - Breakpoints
 *   - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *     breakpoint, or #DB due to a split-lock access)
 *
 * 4.3 VMX-preemption timer expired VM-exit[2]
 *
 * 4.6 NMI-window exiting VM-exit[3]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *   - Code-Segment Limit Violation
 *   - Code Page Fault
 *   - Control protection exception (missing ENDBRANCH at target of indirect
 *     call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *   - Instruction length > 15 bytes
 *   - Invalid Opcode
 *   - Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *   - Overflow
 *   - Bound error
 *   - Invalid TSS
 *   - Segment Not Present
 *   - Stack fault
 *   - General Protection
 *   - Data Page Fault
 *   - Alignment Check
 *   - x86 FPU Floating-point exception
 *   - SIMD floating-point exception
 *   - Virtualization exception
 *   - Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
 *     timer take priority over VM exits caused by the "NMI-window exiting"
 *     VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by "NMI-window exiting". VM exits caused by this control take
 *     priority over non-maskable interrupts (NMIs) and lower priority events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
 *     the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
 *     non-maskable interrupts (NMIs) and higher priority events take priority over
 *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
 *     priority over external interrupts and lower priority events.
 *
 * Return: -EBUSY if the highest priority pending event is currently blocked,
 * 0 if the event was consumed here (typically by emulating a nested VM-Exit),
 * otherwise the result of vmx_complete_nested_posted_interrupt().
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception. If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = vcpu->arch.nested_run_pending;
	/*
	 * Events that don't require injection, i.e. that are virtualized by
	 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
	 * to regain control in order to deliver the event, and hardware will
	 * handle event ordering, e.g. with respect to injected exceptions.
	 *
	 * But, new events (not exceptions) are only recognized at instruction
	 * boundaries. If an event needs reinjection, then KVM is handling a
	 * VM-Exit that occurred _during_ instruction execution; new events,
	 * irrespective of whether or not they're injected, are blocked until
	 * the instruction completes.
	 */
	bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
	/*
	 * Inject events are blocked by nested VM-Enter, as KVM is responsible
	 * for managing priority between concurrent events, i.e. KVM needs to
	 * wait until after VM-Enter completes to deliver injected events.
	 */
	bool block_nested_events = block_nested_exceptions ||
				   block_non_injected_events;

	/* Pending INIT: consumed here, with or without a VM-Exit. */
	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		clear_bit(KVM_APIC_INIT, &apic->pending_events);
		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
			nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);

		/* MTF is discarded if the vCPU is in WFS. */
		vmx->nested.mtf_pending = false;
		return 0;
	}

	/* Pending SIPI: causes a VM-Exit only while in INIT_RECEIVED. */
	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;

		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
						apic->sipi_vector & 0xFFUL);
			return 0;
		}
		/* Fallthrough, the SIPI is completely ignored. */
	}

	/*
	 * Process exceptions that are higher priority than Monitor Trap Flag:
	 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
	 * could theoretically come in from userspace), and ICEBP (INT1).
	 *
	 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
	 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
	 * across SMI/RSM as it should; that needs to be addressed in order to
	 * prioritize SMI over MTF and trap-like #DBs.
	 */
	if (vcpu->arch.exception_vmexit.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vmx->nested.mtf_pending) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
		return 0;
	}

	/*
	 * Whatever exceptions are still pending at this point were skipped
	 * above, i.e. they are the low priority #DB traps that rank below MTF.
	 */
	if (vcpu->arch.exception_vmexit.pending) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (nested_vmx_preemption_timer_pending(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(vcpu))
			goto no_vmexit;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
		int irq;

		if (!nested_exit_on_intr(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			goto no_vmexit;
		}

		/* Without ACK-on-exit the exit carries no interruption info. */
		if (!nested_exit_intr_ack_set(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
			return 0;
		}

		irq = kvm_cpu_get_extint(vcpu);
		if (irq != -1) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
					  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
			return 0;
		}

		irq = kvm_apic_has_interrupt(vcpu);
		if (WARN_ON_ONCE(irq < 0))
			goto no_vmexit;

		/*
		 * If the IRQ is L2's PI notification vector, process posted
		 * interrupts for L2 instead of injecting VM-Exit, as the
		 * detection/morphing architecturally occurs when the IRQ is
		 * delivered to the CPU. Note, only interrupts that are routed
		 * through the local APIC trigger posted interrupt processing,
		 * and enabling posted interrupts requires ACK-on-exit.
		 */
		if (irq == vmx->nested.posted_intr_nv) {
			/*
			 * Nested posted interrupts are delivered via RVI, i.e.
			 * aren't injected by KVM, and so can be queued even if
			 * manual event injection is disallowed.
			 */
			if (block_non_injected_events)
				return -EBUSY;

			vmx->nested.pi_pending = true;
			kvm_apic_clear_irr(vcpu, irq);
			goto no_vmexit;
		}

		if (block_nested_events)
			return -EBUSY;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
				  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);

		/*
		 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
		 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
		 * if APICv is active.
		 */
		kvm_apic_ack_interrupt(vcpu, irq);
		return 0;
	}

no_vmexit:
	return vmx_complete_nested_posted_interrupt(vcpu);
}

/*
 * Convert the time left on the emulated preemption hrtimer into
 * VMX-preemption timer units: remaining nanoseconds times virtual_tsc_khz,
 * divided by 1000000, then shifted right by
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE. Returns 0 if no time remains.
 */
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}

/*
 * Returns true if @field is one of the guest-state fields listed below.
 * NOTE(review): the list matches the fields read by
 * sync_vmcs02_to_vmcs12_rare(), plus GUEST_BNDCFGS, and omits
 * GUEST_CS_AR_BYTES and GUEST_SS_AR_BYTES - confirm the intended
 * relationship against the callers.
 */
static bool is_vmcs12_ext_field(unsigned long field)
{
	switch (field) {
	case GUEST_ES_SELECTOR:
	case GUEST_CS_SELECTOR:
	case GUEST_SS_SELECTOR:
	case GUEST_DS_SELECTOR:
	case GUEST_FS_SELECTOR:
	case GUEST_GS_SELECTOR:
	case GUEST_LDTR_SELECTOR:
	case GUEST_TR_SELECTOR:
	case GUEST_ES_LIMIT:
	case GUEST_CS_LIMIT:
	case GUEST_SS_LIMIT:
	case GUEST_DS_LIMIT:
	case GUEST_FS_LIMIT:
	case GUEST_GS_LIMIT:
	case GUEST_LDTR_LIMIT:
	case GUEST_TR_LIMIT:
	case GUEST_GDTR_LIMIT:
	case GUEST_IDTR_LIMIT:
	case GUEST_ES_AR_BYTES:
	case GUEST_DS_AR_BYTES:
	case GUEST_FS_AR_BYTES:
	case GUEST_GS_AR_BYTES:
	case GUEST_LDTR_AR_BYTES:
	case GUEST_TR_AR_BYTES:
	case GUEST_ES_BASE:
	case GUEST_CS_BASE:
	case GUEST_SS_BASE:
	case GUEST_DS_BASE:
	case GUEST_FS_BASE:
	case GUEST_GS_BASE:
	case GUEST_LDTR_BASE:
	case GUEST_TR_BASE:
	case GUEST_GDTR_BASE:
	case GUEST_IDTR_BASE:
	case GUEST_PENDING_DBG_EXCEPTIONS:
	case GUEST_BNDCFGS:
		return true;
	default:
		break;
	}

	return false;
}

/*
 * Read the guest segment selectors, limits, access rights (all but CS and
 * SS), segment/GDTR/IDTR bases and pending debug exceptions through the
 * vmcs_read*() accessors into @vmcs12, then clear
 * need_sync_vmcs02_to_vmcs12_rare.
 */
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}

/*
 * Perform a deferred sync_vmcs02_to_vmcs12_rare() if one is still needed.
 * vmcs01 is expected to be the loaded VMCS; vmcs02 is loaded for the
 * duration of the sync and vmcs01 is reloaded afterwards, all between
 * get_cpu() and put_cpu().
 */
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;


	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	cpu = get_cpu();
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load_vmcs(vcpu, cpu);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	put_cpu();
}

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
4657 */ 4658 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4659 { 4660 struct vcpu_vmx *vmx = to_vmx(vcpu); 4661 4662 if (nested_vmx_is_evmptr12_valid(vmx)) 4663 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4664 4665 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4666 !nested_vmx_is_evmptr12_valid(vmx); 4667 4668 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4669 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4670 4671 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4672 vmcs12->guest_rip = kvm_rip_read(vcpu); 4673 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4674 4675 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4676 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4677 4678 vmcs12->guest_interruptibility_info = 4679 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4680 4681 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4682 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4683 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4684 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4685 else 4686 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4687 4688 if (nested_cpu_has_preemption_timer(vmcs12) && 4689 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4690 !vcpu->arch.nested_run_pending) 4691 vmcs12->vmx_preemption_timer_value = 4692 vmx_get_preemption_timer_value(vcpu); 4693 4694 /* 4695 * In some cases (usually, nested EPT), L2 is allowed to change its 4696 * own CR3 without exiting. If it has changed it, we must keep it. 4697 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4698 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4699 * 4700 * Additionally, restore L2's PDPTR to vmcs12. 
4701 */ 4702 if (enable_ept) { 4703 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4704 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4705 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4706 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4707 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4708 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4709 } 4710 } 4711 4712 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4713 4714 if (nested_cpu_has_vid(vmcs12)) 4715 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4716 4717 vmcs12->vm_entry_controls = 4718 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4719 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4720 4721 /* 4722 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4723 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4724 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4725 * vmcs02 doesn't strictly track vmcs12. 4726 */ 4727 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4728 vmcs12->guest_dr7 = vcpu->arch.dr7; 4729 4730 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4731 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4732 4733 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4734 &vmcs12->guest_ssp, 4735 &vmcs12->guest_ssp_tbl); 4736 } 4737 4738 /* 4739 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4740 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4741 * and this function updates it to reflect the changes to the guest state while 4742 * L2 was running (and perhaps made some exits which were handled directly by L0 4743 * without going back to L1), and to reflect the exit reason. 4744 * Note that we do not have to copy here all VMCS fields, just those that 4745 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4746 * exit-information fields only. 
Other fields are modified by L1 with VMWRITE, 4747 * which already writes to vmcs12 directly. 4748 */ 4749 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4750 u32 vm_exit_reason, u32 exit_intr_info, 4751 unsigned long exit_qualification, u32 exit_insn_len) 4752 { 4753 /* update exit information fields: */ 4754 vmcs12->vm_exit_reason = vm_exit_reason; 4755 if (vmx_get_exit_reason(vcpu).enclave_mode) 4756 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4757 vmcs12->exit_qualification = exit_qualification; 4758 4759 /* 4760 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4761 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4762 * exit info fields are unmodified. 4763 */ 4764 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4765 vmcs12->launch_state = 1; 4766 4767 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4768 * instead of reading the real value. */ 4769 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4770 4771 /* 4772 * Transfer the event that L0 or L1 may wanted to inject into 4773 * L2 to IDT_VECTORING_INFO_FIELD. 4774 */ 4775 vmcs12_save_pending_event(vcpu, vmcs12, 4776 vm_exit_reason, exit_intr_info); 4777 4778 vmcs12->vm_exit_intr_info = exit_intr_info; 4779 vmcs12->vm_exit_instruction_len = exit_insn_len; 4780 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4781 4782 /* 4783 * According to spec, there's no need to store the guest's 4784 * MSRs if the exit is due to a VM-entry failure that occurs 4785 * during or after loading the guest state. Since this exit 4786 * does not fall in that category, we need to save the MSRs. 
4787 */ 4788 if (nested_vmx_store_msr(vcpu, 4789 vmcs12->vm_exit_msr_store_addr, 4790 vmcs12->vm_exit_msr_store_count)) 4791 nested_vmx_abort(vcpu, 4792 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4793 } 4794 } 4795 4796 /* 4797 * A part of what we need to when the nested L2 guest exits and we want to 4798 * run its L1 parent, is to reset L1's guest state to the host state specified 4799 * in vmcs12. 4800 * This function is to be called not only on normal nested exit, but also on 4801 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4802 * Failures During or After Loading Guest State"). 4803 * This function should be called when the active VMCS is L1's (vmcs01). 4804 */ 4805 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4806 struct vmcs12 *vmcs12) 4807 { 4808 enum vm_entry_failure_code ignored; 4809 struct kvm_segment seg; 4810 4811 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4812 vcpu->arch.efer = vmcs12->host_ia32_efer; 4813 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4814 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4815 else 4816 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4817 vmx_set_efer(vcpu, vcpu->arch.efer); 4818 4819 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4820 kvm_rip_write(vcpu, vmcs12->host_rip); 4821 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4822 vmx_set_interrupt_shadow(vcpu, 0); 4823 4824 /* 4825 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4826 * actually changed, because vmx_set_cr0 refers to efer set above. 4827 * 4828 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4829 * (KVM doesn't change it); 4830 */ 4831 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4832 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4833 4834 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4835 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4836 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4837 4838 nested_ept_uninit_mmu_context(vcpu); 4839 4840 /* 4841 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4842 * couldn't have changed. 4843 */ 4844 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4845 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4846 4847 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4848 4849 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4850 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4851 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4852 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4853 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4854 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4855 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4856 4857 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4858 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4859 vmcs_write64(GUEST_BNDCFGS, 0); 4860 4861 /* 4862 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. 4863 * otherwise CET state should be retained across VM-exit, i.e., 4864 * guest values should be propagated from vmcs12 to vmcs01. 
4865 */ 4866 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4867 vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4868 vmcs12->host_ssp_tbl); 4869 else 4870 vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4871 vmcs12->guest_ssp_tbl); 4872 4873 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4874 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4875 vcpu->arch.pat = vmcs12->host_ia32_pat; 4876 } 4877 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4878 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4879 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4880 vmcs12->host_ia32_perf_global_ctrl)); 4881 4882 /* Set L1 segment info according to Intel SDM 4883 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4884 seg = (struct kvm_segment) { 4885 .base = 0, 4886 .limit = 0xFFFFFFFF, 4887 .selector = vmcs12->host_cs_selector, 4888 .type = 11, 4889 .present = 1, 4890 .s = 1, 4891 .g = 1 4892 }; 4893 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4894 seg.l = 1; 4895 else 4896 seg.db = 1; 4897 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4898 seg = (struct kvm_segment) { 4899 .base = 0, 4900 .limit = 0xFFFFFFFF, 4901 .type = 3, 4902 .present = 1, 4903 .s = 1, 4904 .db = 1, 4905 .g = 1 4906 }; 4907 seg.selector = vmcs12->host_ds_selector; 4908 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4909 seg.selector = vmcs12->host_es_selector; 4910 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4911 seg.selector = vmcs12->host_ss_selector; 4912 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4913 seg.selector = vmcs12->host_fs_selector; 4914 seg.base = vmcs12->host_fs_base; 4915 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4916 seg.selector = vmcs12->host_gs_selector; 4917 seg.base = vmcs12->host_gs_base; 4918 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4919 seg = (struct kvm_segment) { 4920 .base = vmcs12->host_tr_base, 4921 .limit = 0x67, 4922 .selector = 
vmcs12->host_tr_selector, 4923 .type = 11, 4924 .present = 1 4925 }; 4926 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4927 4928 memset(&seg, 0, sizeof(seg)); 4929 seg.unusable = 1; 4930 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4931 4932 kvm_set_dr(vcpu, 7, 0x400); 4933 vmx_guest_debugctl_write(vcpu, 0); 4934 4935 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4936 vmcs12->vm_exit_msr_load_count)) 4937 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4938 4939 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4940 } 4941 4942 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4943 { 4944 struct vmx_uret_msr *efer_msr; 4945 unsigned int i; 4946 4947 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4948 return vmcs_read64(GUEST_IA32_EFER); 4949 4950 if (cpu_has_load_ia32_efer()) 4951 return kvm_host.efer; 4952 4953 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4954 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4955 return vmx->msr_autoload.guest.val[i].value; 4956 } 4957 4958 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4959 if (efer_msr) 4960 return efer_msr->data; 4961 4962 return kvm_host.efer; 4963 } 4964 4965 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4966 { 4967 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4968 struct vcpu_vmx *vmx = to_vmx(vcpu); 4969 struct vmx_msr_entry g, h; 4970 gpa_t gpa; 4971 u32 i, j; 4972 4973 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4974 4975 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4976 /* 4977 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4978 * as vmcs01.GUEST_DR7 contains a userspace defined value 4979 * and vcpu->arch.dr7 is not squirreled away before the 4980 * nested VMENTER (not worth adding a variable in nested_vmx). 
4981 */ 4982 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4983 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4984 else 4985 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4986 } 4987 4988 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4989 vmx_reload_guest_debugctl(vcpu); 4990 4991 /* 4992 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4993 * handle a variety of side effects to KVM's software model. 4994 */ 4995 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4996 4997 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4998 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4999 5000 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 5001 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 5002 5003 nested_ept_uninit_mmu_context(vcpu); 5004 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 5005 kvm_register_mark_available(vcpu, VCPU_REG_CR3); 5006 5007 /* 5008 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 5009 * from vmcs01 (if necessary). The PDPTRs are not loaded on 5010 * VMFail, like everything else we just need to ensure our 5011 * software model is up-to-date. 5012 */ 5013 if (enable_ept && is_pae_paging(vcpu)) 5014 ept_save_pdptrs(vcpu); 5015 5016 kvm_mmu_reset_context(vcpu); 5017 5018 /* 5019 * This nasty bit of open coding is a compromise between blindly 5020 * loading L1's MSRs using the exit load lists (incorrect emulation 5021 * of VMFail), leaving the nested VM's MSRs in the software model 5022 * (incorrect behavior) and snapshotting the modified MSRs (too 5023 * expensive since the lists are unbound by hardware). For each 5024 * MSR that was (prematurely) loaded from the nested VMEntry load 5025 * list, reload it from the exit load list if it exists and differs 5026 * from the guest value. The intent is to stuff host state as 5027 * silently as possible, not to fully process the exit load list. 
5028 */ 5029 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 5030 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 5031 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 5032 pr_debug_ratelimited( 5033 "%s read MSR index failed (%u, 0x%08llx)\n", 5034 __func__, i, gpa); 5035 goto vmabort; 5036 } 5037 5038 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 5039 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 5040 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 5041 pr_debug_ratelimited( 5042 "%s read MSR failed (%u, 0x%08llx)\n", 5043 __func__, j, gpa); 5044 goto vmabort; 5045 } 5046 if (h.index != g.index) 5047 continue; 5048 if (h.value == g.value) 5049 break; 5050 5051 if (nested_vmx_load_msr_check(vcpu, &h)) { 5052 pr_debug_ratelimited( 5053 "%s check failed (%u, 0x%x, 0x%x)\n", 5054 __func__, j, h.index, h.reserved); 5055 goto vmabort; 5056 } 5057 5058 if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 5059 pr_debug_ratelimited( 5060 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 5061 __func__, j, h.index, h.value); 5062 goto vmabort; 5063 } 5064 } 5065 } 5066 5067 return; 5068 5069 vmabort: 5070 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 5071 } 5072 5073 /* 5074 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 5075 * and modify vmcs12 to make it see what it would expect to see there if 5076 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 5077 */ 5078 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 5079 u32 exit_intr_info, unsigned long exit_qualification, 5080 u32 exit_insn_len) 5081 { 5082 struct vcpu_vmx *vmx = to_vmx(vcpu); 5083 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5084 5085 /* Pending MTF traps are discarded on VM-Exit. 
*/ 5086 vmx->nested.mtf_pending = false; 5087 5088 /* trying to cancel vmlaunch/vmresume is a bug */ 5089 kvm_warn_on_nested_run_pending(vcpu); 5090 5091 #ifdef CONFIG_KVM_HYPERV 5092 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 5093 /* 5094 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 5095 * Enlightened VMCS after migration and we still need to 5096 * do that when something is forcing L2->L1 exit prior to 5097 * the first L2 run. 5098 */ 5099 (void)nested_get_evmcs_page(vcpu); 5100 } 5101 #endif 5102 5103 /* Service pending TLB flush requests for L2 before switching to L1. */ 5104 kvm_service_local_tlb_flush_requests(vcpu); 5105 5106 /* 5107 * VCPU_REG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 5108 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 5109 * up-to-date before switching to L1. 5110 */ 5111 if (enable_ept && is_pae_paging(vcpu)) 5112 vmx_ept_load_pdptrs(vcpu); 5113 5114 leave_guest_mode(vcpu); 5115 5116 if (nested_cpu_has_preemption_timer(vmcs12)) 5117 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5118 5119 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5120 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5121 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5122 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5123 } 5124 5125 if (likely(!vmx->fail)) { 5126 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5127 5128 if (vm_exit_reason != -1) 5129 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5130 exit_intr_info, exit_qualification, 5131 exit_insn_len); 5132 5133 /* 5134 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5135 * also be used to capture vmcs12 cache as part of 5136 * capturing nVMX state for snapshot (migration). 5137 * 5138 * Otherwise, this flush will dirty guest memory at a 5139 * point it is already assumed by user-space to be 5140 * immutable. 
5141 */ 5142 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5143 } else { 5144 /* 5145 * The only expected VM-instruction error is "VM entry with 5146 * invalid control field(s)." Anything else indicates a 5147 * problem with L0. 5148 */ 5149 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5150 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5151 5152 /* VM-Fail at VM-Entry means KVM missed a consistency check. */ 5153 WARN_ON_ONCE(warn_on_missed_cc); 5154 } 5155 5156 /* 5157 * Drop events/exceptions that were queued for re-injection to L2 5158 * (picked up via vmx_complete_interrupts()), as well as exceptions 5159 * that were pending for L2. Note, this must NOT be hoisted above 5160 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5161 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5162 */ 5163 vcpu->arch.nmi_injected = false; 5164 kvm_clear_exception_queue(vcpu); 5165 kvm_clear_interrupt_queue(vcpu); 5166 5167 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5168 5169 kvm_nested_vmexit_handle_ibrs(vcpu); 5170 5171 /* 5172 * Update any VMCS fields that might have changed while vmcs02 was the 5173 * active VMCS. The tracking is per-vCPU, not per-VMCS. 
5174 */ 5175 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 5176 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5177 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5178 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5179 if (kvm_caps.has_tsc_control) 5180 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5181 5182 nested_put_vmcs12_pages(vcpu); 5183 5184 if ((vm_exit_reason != -1) && 5185 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5186 vmx->nested.need_vmcs12_to_shadow_sync = true; 5187 5188 /* in case we halted in L2 */ 5189 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5190 5191 if (likely(!vmx->fail)) { 5192 if (vm_exit_reason != -1) 5193 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5194 vmcs12->exit_qualification, 5195 vmcs12->idt_vectoring_info_field, 5196 vmcs12->vm_exit_intr_info, 5197 vmcs12->vm_exit_intr_error_code, 5198 KVM_ISA_VMX); 5199 5200 load_vmcs12_host_state(vcpu, vmcs12); 5201 5202 /* 5203 * Process events if an injectable IRQ or NMI is pending, even 5204 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5205 * If an event became pending while L2 was active, KVM needs to 5206 * either inject the event or request an IRQ/NMI window. SMIs 5207 * don't need to be processed as SMM is mutually exclusive with 5208 * non-root mode. INIT/SIPI don't need to be checked as INIT 5209 * is blocked post-VMXON, and SIPIs are ignored. 5210 */ 5211 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5212 kvm_make_request(KVM_REQ_EVENT, vcpu); 5213 return; 5214 } 5215 5216 /* 5217 * After an early L2 VM-entry failure, we're now back 5218 * in L1 which thinks it just finished a VMLAUNCH or 5219 * VMRESUME instruction, so we need to set the failure 5220 * flag and the VM-instruction error field of the VMCS 5221 * accordingly, and skip the emulated instruction. 
5222 */ 5223 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5224 5225 /* 5226 * Restore L1's host state to KVM's software model. We're here 5227 * because a consistency check was caught by hardware, which 5228 * means some amount of guest state has been propagated to KVM's 5229 * model and needs to be unwound to the host's state. 5230 */ 5231 nested_vmx_restore_host_state(vcpu); 5232 5233 vmx->fail = 0; 5234 } 5235 5236 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5237 { 5238 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5239 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5240 } 5241 5242 /* 5243 * Decode the memory-address operand of a vmx instruction, as recorded on an 5244 * exit caused by such an instruction (run by a guest hypervisor). 5245 * On success, returns 0. When the operand is invalid, returns 1 and throws 5246 * #UD, #GP, or #SS. 5247 */ 5248 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5249 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5250 { 5251 gva_t off; 5252 bool exn; 5253 struct kvm_segment s; 5254 5255 /* 5256 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5257 * Execution", on an exit, vmx_instruction_info holds most of the 5258 * addressing components of the operand. Only the displacement part 5259 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5260 * For how an actual address is calculated from all these components, 5261 * refer to Vol. 1, "Operand Addressing". 
5262 */ 5263 int scaling = vmx_instruction_info & 3; 5264 int addr_size = (vmx_instruction_info >> 7) & 7; 5265 bool is_reg = vmx_instruction_info & (1u << 10); 5266 int seg_reg = (vmx_instruction_info >> 15) & 7; 5267 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5268 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5269 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5270 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5271 5272 if (is_reg) { 5273 kvm_queue_exception(vcpu, UD_VECTOR); 5274 return 1; 5275 } 5276 5277 /* Addr = segment_base + offset */ 5278 /* offset = base + [index * scale] + displacement */ 5279 off = exit_qualification; /* holds the displacement */ 5280 if (addr_size == 1) 5281 off = (gva_t)sign_extend64(off, 31); 5282 else if (addr_size == 0) 5283 off = (gva_t)sign_extend64(off, 15); 5284 if (base_is_valid) 5285 off += kvm_register_read(vcpu, base_reg); 5286 if (index_is_valid) 5287 off += kvm_register_read(vcpu, index_reg) << scaling; 5288 vmx_get_segment(vcpu, &s, seg_reg); 5289 5290 /* 5291 * The effective address, i.e. @off, of a memory operand is truncated 5292 * based on the address size of the instruction. Note that this is 5293 * the *effective address*, i.e. the address prior to accounting for 5294 * the segment's base. 5295 */ 5296 if (addr_size == 1) /* 32 bit */ 5297 off &= 0xffffffff; 5298 else if (addr_size == 0) /* 16 bit */ 5299 off &= 0xffff; 5300 5301 /* Checks for #GP/#SS exceptions. */ 5302 exn = false; 5303 if (is_long_mode(vcpu)) { 5304 /* 5305 * The virtual/linear address is never truncated in 64-bit 5306 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5307 * address when using FS/GS with a non-zero base. 5308 */ 5309 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5310 *ret = s.base + off; 5311 else 5312 *ret = off; 5313 5314 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5315 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5316 * non-canonical form. 
This is the only check on the memory 5317 * destination for long mode! 5318 */ 5319 exn = is_noncanonical_address(*ret, vcpu, 0); 5320 } else { 5321 /* 5322 * When not in long mode, the virtual/linear address is 5323 * unconditionally truncated to 32 bits regardless of the 5324 * address size. 5325 */ 5326 *ret = (s.base + off) & 0xffffffff; 5327 5328 /* Protected mode: apply checks for segment validity in the 5329 * following order: 5330 * - segment type check (#GP(0) may be thrown) 5331 * - usability check (#GP(0)/#SS(0)) 5332 * - limit check (#GP(0)/#SS(0)) 5333 */ 5334 if (wr) 5335 /* #GP(0) if the destination operand is located in a 5336 * read-only data segment or any code segment. 5337 */ 5338 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5339 else 5340 /* #GP(0) if the source operand is located in an 5341 * execute-only code segment 5342 */ 5343 exn = ((s.type & 0xa) == 8); 5344 if (exn) { 5345 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5346 return 1; 5347 } 5348 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5349 */ 5350 exn = (s.unusable != 0); 5351 5352 /* 5353 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5354 * outside the segment limit. All CPUs that support VMX ignore 5355 * limit checks for flat segments, i.e. segments with base==0, 5356 * limit==0xffffffff and of type expand-up data or code. 5357 */ 5358 if (!(s.base == 0 && s.limit == 0xffffffff && 5359 ((s.type & 8) || !(s.type & 4)))) 5360 exn = exn || ((u64)off + len - 1 > s.limit); 5361 } 5362 if (exn) { 5363 kvm_queue_exception_e(vcpu, 5364 seg_reg == VCPU_SREG_SS ? 
5365 SS_VECTOR : GP_VECTOR, 5366 0); 5367 return 1; 5368 } 5369 5370 return 0; 5371 } 5372 5373 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5374 int *ret) 5375 { 5376 gva_t gva; 5377 struct x86_exception e; 5378 int r; 5379 5380 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5381 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5382 sizeof(*vmpointer), &gva)) { 5383 *ret = 1; 5384 return -EINVAL; 5385 } 5386 5387 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5388 if (r != X86EMUL_CONTINUE) { 5389 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5390 return -EINVAL; 5391 } 5392 5393 return 0; 5394 } 5395 5396 /* 5397 * Allocate a shadow VMCS and associate it with the currently loaded 5398 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5399 * VMCS is also VMCLEARed, so that it is ready for use. 5400 */ 5401 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5402 { 5403 struct vcpu_vmx *vmx = to_vmx(vcpu); 5404 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5405 5406 /* 5407 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5408 * when L1 executes VMXOFF or the vCPU is forced out of nested 5409 * operation. VMXON faults if the CPU is already post-VMXON, so it 5410 * should be impossible to already have an allocated shadow VMCS. KVM 5411 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5412 * always be the loaded VMCS. 
5413 */ 5414 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5415 return loaded_vmcs->shadow_vmcs; 5416 5417 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5418 if (loaded_vmcs->shadow_vmcs) 5419 vmcs_clear(loaded_vmcs->shadow_vmcs); 5420 5421 return loaded_vmcs->shadow_vmcs; 5422 } 5423 5424 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5425 { 5426 struct vcpu_vmx *vmx = to_vmx(vcpu); 5427 int r; 5428 5429 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5430 if (r < 0) 5431 goto out_vmcs02; 5432 5433 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5434 if (!vmx->nested.cached_vmcs12) 5435 goto out_cached_vmcs12; 5436 5437 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5438 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5439 if (!vmx->nested.cached_shadow_vmcs12) 5440 goto out_cached_shadow_vmcs12; 5441 5442 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5443 goto out_shadow_vmcs; 5444 5445 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5446 HRTIMER_MODE_ABS_PINNED); 5447 5448 vmx->nested.vpid02 = allocate_vpid(); 5449 5450 vmx->nested.vmcs02_initialized = false; 5451 vmx->nested.vmxon = true; 5452 5453 if (vmx_pt_mode_is_host_guest()) { 5454 vmx->pt_desc.guest.ctl = 0; 5455 pt_update_intercept_for_msr(vcpu); 5456 } 5457 5458 return 0; 5459 5460 out_shadow_vmcs: 5461 kfree(vmx->nested.cached_shadow_vmcs12); 5462 5463 out_cached_shadow_vmcs12: 5464 kfree(vmx->nested.cached_vmcs12); 5465 5466 out_cached_vmcs12: 5467 free_loaded_vmcs(&vmx->nested.vmcs02); 5468 5469 out_vmcs02: 5470 return -ENOMEM; 5471 } 5472 5473 /* Emulate the VMXON instruction. 
*/ 5474 static int handle_vmxon(struct kvm_vcpu *vcpu) 5475 { 5476 int ret; 5477 gpa_t vmptr; 5478 uint32_t revision; 5479 struct vcpu_vmx *vmx = to_vmx(vcpu); 5480 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5481 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5482 5483 /* 5484 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5485 * the guest and so cannot rely on hardware to perform the check, 5486 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5487 * for VMXON). 5488 * 5489 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5490 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5491 * force any of the relevant guest state. For a restricted guest, KVM 5492 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5493 * Real Mode, and so there's no need to check CR0.PE manually. 5494 */ 5495 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5496 kvm_queue_exception(vcpu, UD_VECTOR); 5497 return 1; 5498 } 5499 5500 /* 5501 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5502 * and has higher priority than the VM-Fail due to being post-VMXON, 5503 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5504 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5505 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5506 * VMX non-root. 5507 * 5508 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5509 * #UD checks (see above), is functionally ok because KVM doesn't allow 5510 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5511 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5512 * missed by hardware due to shadowing CR0 and/or CR4. 
5513 */ 5514 if (vmx_get_cpl(vcpu)) { 5515 kvm_inject_gp(vcpu, 0); 5516 return 1; 5517 } 5518 5519 if (vmx->nested.vmxon) 5520 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5521 5522 /* 5523 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5524 * only if the vCPU isn't already in VMX operation, i.e. effectively 5525 * have lower priority than the VM-Fail above. 5526 */ 5527 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5528 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5529 kvm_inject_gp(vcpu, 0); 5530 return 1; 5531 } 5532 5533 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5534 != VMXON_NEEDED_FEATURES) { 5535 kvm_inject_gp(vcpu, 0); 5536 return 1; 5537 } 5538 5539 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5540 return ret; 5541 5542 /* 5543 * SDM 3: 24.11.5 5544 * The first 4 bytes of VMXON region contain the supported 5545 * VMCS revision identifier 5546 * 5547 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5548 * which replaces physical address width with 32 5549 */ 5550 if (!page_address_valid(vcpu, vmptr)) 5551 return nested_vmx_failInvalid(vcpu); 5552 5553 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5554 revision != VMCS12_REVISION) 5555 return nested_vmx_failInvalid(vcpu); 5556 5557 vmx->nested.vmxon_ptr = vmptr; 5558 ret = enter_vmx_operation(vcpu); 5559 if (ret) 5560 return ret; 5561 5562 return nested_vmx_succeed(vcpu); 5563 } 5564 5565 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5566 { 5567 struct vcpu_vmx *vmx = to_vmx(vcpu); 5568 5569 if (vmx->nested.current_vmptr == INVALID_GPA) 5570 return; 5571 5572 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5573 5574 if (enable_shadow_vmcs) { 5575 /* copy to memory all shadowed fields in case 5576 they were modified */ 5577 copy_shadow_to_vmcs12(vmx); 5578 vmx_disable_shadow_vmcs(vmx); 5579 } 5580 vmx->nested.posted_intr_nv = -1; 5581 5582 /* Flush VMCS12 to guest memory */ 
5583 kvm_vcpu_write_guest_page(vcpu, 5584 vmx->nested.current_vmptr >> PAGE_SHIFT, 5585 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5586 5587 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5588 5589 vmx->nested.current_vmptr = INVALID_GPA; 5590 } 5591 5592 /* Emulate the VMXOFF instruction */ 5593 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5594 { 5595 if (!nested_vmx_check_permission(vcpu)) 5596 return 1; 5597 5598 free_nested(vcpu); 5599 5600 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5601 kvm_make_request(KVM_REQ_EVENT, vcpu); 5602 5603 return nested_vmx_succeed(vcpu); 5604 } 5605 5606 /* Emulate the VMCLEAR instruction */ 5607 static int handle_vmclear(struct kvm_vcpu *vcpu) 5608 { 5609 struct vcpu_vmx *vmx = to_vmx(vcpu); 5610 u32 zero = 0; 5611 gpa_t vmptr; 5612 int r; 5613 5614 if (!nested_vmx_check_permission(vcpu)) 5615 return 1; 5616 5617 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5618 return r; 5619 5620 if (!page_address_valid(vcpu, vmptr)) 5621 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5622 5623 if (vmptr == vmx->nested.vmxon_ptr) 5624 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5625 5626 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5627 if (vmptr == vmx->nested.current_vmptr) 5628 nested_release_vmcs12(vcpu); 5629 5630 /* 5631 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5632 * for VMCLEAR includes a "ensure that data for VMCS referenced 5633 * by the operand is in memory" clause that guards writes to 5634 * memory, i.e. doing nothing for I/O is architecturally valid. 5635 * 5636 * FIXME: Suppress failures if and only if no memslot is found, 5637 * i.e. exit to userspace if __copy_to_user() fails. 
5638 */ 5639 (void)kvm_vcpu_write_guest(vcpu, 5640 vmptr + offsetof(struct vmcs12, 5641 launch_state), 5642 &zero, sizeof(zero)); 5643 } 5644 5645 return nested_vmx_succeed(vcpu); 5646 } 5647 5648 /* Emulate the VMLAUNCH instruction */ 5649 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5650 { 5651 return nested_vmx_run(vcpu, true); 5652 } 5653 5654 /* Emulate the VMRESUME instruction */ 5655 static int handle_vmresume(struct kvm_vcpu *vcpu) 5656 { 5657 5658 return nested_vmx_run(vcpu, false); 5659 } 5660 5661 static int handle_vmread(struct kvm_vcpu *vcpu) 5662 { 5663 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5664 : get_vmcs12(vcpu); 5665 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5666 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5667 struct vcpu_vmx *vmx = to_vmx(vcpu); 5668 struct x86_exception e; 5669 unsigned long field; 5670 u64 value; 5671 gva_t gva = 0; 5672 short offset; 5673 int len, r; 5674 5675 if (!nested_vmx_check_permission(vcpu)) 5676 return 1; 5677 5678 /* Decode instruction info and find the field to read */ 5679 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5680 5681 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5682 /* 5683 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5684 * any VMREAD sets the ALU flags for VMfailInvalid. 
5685 */ 5686 if (vmx->nested.current_vmptr == INVALID_GPA || 5687 (is_guest_mode(vcpu) && 5688 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5689 return nested_vmx_failInvalid(vcpu); 5690 5691 offset = get_vmcs12_field_offset(field); 5692 if (offset < 0) 5693 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5694 5695 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5696 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5697 5698 /* Read the field, zero-extended to a u64 value */ 5699 value = vmcs12_read_any(vmcs12, field, offset); 5700 } else { 5701 /* 5702 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5703 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5704 * unsupported. Unfortunately, certain versions of Windows 11 5705 * don't comply with this requirement which is not enforced in 5706 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5707 * workaround, as misbehaving guests will panic on VM-Fail. 5708 * Note, enlightened VMCS is incompatible with shadow VMCS so 5709 * all VMREADs from L2 should go to L1. 5710 */ 5711 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5712 return nested_vmx_failInvalid(vcpu); 5713 5714 offset = evmcs_field_offset(field, NULL); 5715 if (offset < 0) 5716 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5717 5718 /* Read the field, zero-extended to a u64 value */ 5719 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5720 } 5721 5722 /* 5723 * Now copy part of this value to register or memory, as requested. 5724 * Note that the number of bits actually copied is 32 or 64 depending 5725 * on the guest's mode (32 or 64 bit), not on the given field's length. 5726 */ 5727 if (instr_info & BIT(10)) { 5728 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5729 } else { 5730 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5731 if (get_vmx_mem_address(vcpu, exit_qualification, 5732 instr_info, true, len, &gva)) 5733 return 1; 5734 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5735 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5736 if (r != X86EMUL_CONTINUE) 5737 return kvm_handle_memory_failure(vcpu, r, &e); 5738 } 5739 5740 return nested_vmx_succeed(vcpu); 5741 } 5742 5743 static bool is_shadow_field_rw(unsigned long field) 5744 { 5745 switch (field) { 5746 #define SHADOW_FIELD_RW(x, y) case x: 5747 #include "vmcs_shadow_fields.h" 5748 return true; 5749 default: 5750 break; 5751 } 5752 return false; 5753 } 5754 5755 static bool is_shadow_field_ro(unsigned long field) 5756 { 5757 switch (field) { 5758 #define SHADOW_FIELD_RO(x, y) case x: 5759 #include "vmcs_shadow_fields.h" 5760 return true; 5761 default: 5762 break; 5763 } 5764 return false; 5765 } 5766 5767 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5768 { 5769 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5770 : get_vmcs12(vcpu); 5771 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5772 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5773 struct vcpu_vmx *vmx = to_vmx(vcpu); 5774 struct x86_exception e; 5775 unsigned long field; 5776 short offset; 5777 gva_t gva; 5778 int len, r; 5779 5780 /* 5781 * The value to write might be 32 or 64 bits, depending on L1's long 5782 * mode, and eventually we need to write that into a field of several 5783 * possible lengths. The code below first zero-extends the value to 64 5784 * bit (value), and then copies only the appropriate number of 5785 * bits into the vmcs12 field. 5786 */ 5787 u64 value = 0; 5788 5789 if (!nested_vmx_check_permission(vcpu)) 5790 return 1; 5791 5792 /* 5793 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5794 * any VMWRITE sets the ALU flags for VMfailInvalid. 
5795 */ 5796 if (vmx->nested.current_vmptr == INVALID_GPA || 5797 (is_guest_mode(vcpu) && 5798 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5799 return nested_vmx_failInvalid(vcpu); 5800 5801 if (instr_info & BIT(10)) 5802 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5803 else { 5804 len = is_64_bit_mode(vcpu) ? 8 : 4; 5805 if (get_vmx_mem_address(vcpu, exit_qualification, 5806 instr_info, false, len, &gva)) 5807 return 1; 5808 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5809 if (r != X86EMUL_CONTINUE) 5810 return kvm_handle_memory_failure(vcpu, r, &e); 5811 } 5812 5813 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5814 5815 offset = get_vmcs12_field_offset(field); 5816 if (offset < 0) 5817 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5818 5819 /* 5820 * If the vCPU supports "VMWRITE to any supported field in the 5821 * VMCS," then the "read-only" fields are actually read/write. 5822 */ 5823 if (vmcs_field_readonly(field) && 5824 !nested_cpu_has_vmwrite_any_field(vcpu)) 5825 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5826 5827 /* 5828 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5829 * vmcs12, else we may crush a field or consume a stale value. 5830 */ 5831 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5832 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5833 5834 /* 5835 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5836 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5837 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5838 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5839 * from L1 will return a different value than VMREAD from L2 (L1 sees 5840 * the stripped down value, L2 sees the full value as stored by KVM). 
5841 */ 5842 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5843 value &= 0x1f0ff; 5844 5845 vmcs12_write_any(vmcs12, field, offset, value); 5846 5847 /* 5848 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5849 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5850 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5851 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5852 */ 5853 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5854 /* 5855 * L1 can read these fields without exiting, ensure the 5856 * shadow VMCS is up-to-date. 5857 */ 5858 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5859 preempt_disable(); 5860 vmcs_load(vmx->vmcs01.shadow_vmcs); 5861 5862 __vmcs_writel(field, value); 5863 5864 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5865 vmcs_load(vmx->loaded_vmcs->vmcs); 5866 preempt_enable(); 5867 } 5868 vmx->nested.dirty_vmcs12 = true; 5869 } 5870 5871 return nested_vmx_succeed(vcpu); 5872 } 5873 5874 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5875 { 5876 vmx->nested.current_vmptr = vmptr; 5877 if (enable_shadow_vmcs) { 5878 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5879 vmcs_write64(VMCS_LINK_POINTER, 5880 __pa(vmx->vmcs01.shadow_vmcs)); 5881 vmx->nested.need_vmcs12_to_shadow_sync = true; 5882 } 5883 vmx->nested.dirty_vmcs12 = true; 5884 vmx->nested.force_msr_bitmap_recalc = true; 5885 } 5886 5887 /* Emulate the VMPTRLD instruction */ 5888 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5889 { 5890 struct vcpu_vmx *vmx = to_vmx(vcpu); 5891 gpa_t vmptr; 5892 int r; 5893 5894 if (!nested_vmx_check_permission(vcpu)) 5895 return 1; 5896 5897 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5898 return r; 5899 5900 if (!page_address_valid(vcpu, vmptr)) 5901 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5902 5903 if (vmptr == vmx->nested.vmxon_ptr) 5904 return nested_vmx_fail(vcpu, 
VMXERR_VMPTRLD_VMXON_POINTER); 5905 5906 /* Forbid normal VMPTRLD if Enlightened version was used */ 5907 if (nested_vmx_is_evmptr12_valid(vmx)) 5908 return 1; 5909 5910 if (vmx->nested.current_vmptr != vmptr) { 5911 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5912 struct vmcs_hdr hdr; 5913 5914 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5915 /* 5916 * Reads from an unbacked page return all 1s, 5917 * which means that the 32 bits located at the 5918 * given physical address won't match the required 5919 * VMCS12_REVISION identifier. 5920 */ 5921 return nested_vmx_fail(vcpu, 5922 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5923 } 5924 5925 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5926 offsetof(struct vmcs12, hdr), 5927 sizeof(hdr))) { 5928 return nested_vmx_fail(vcpu, 5929 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5930 } 5931 5932 if (hdr.revision_id != VMCS12_REVISION || 5933 (hdr.shadow_vmcs && 5934 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5935 return nested_vmx_fail(vcpu, 5936 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5937 } 5938 5939 nested_release_vmcs12(vcpu); 5940 5941 /* 5942 * Load VMCS12 from guest memory since it is not already 5943 * cached. 
5944 */ 5945 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5946 VMCS12_SIZE)) { 5947 return nested_vmx_fail(vcpu, 5948 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5949 } 5950 5951 set_current_vmptr(vmx, vmptr); 5952 } 5953 5954 return nested_vmx_succeed(vcpu); 5955 } 5956 5957 /* Emulate the VMPTRST instruction */ 5958 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5959 { 5960 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5961 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5962 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5963 struct x86_exception e; 5964 gva_t gva; 5965 int r; 5966 5967 if (!nested_vmx_check_permission(vcpu)) 5968 return 1; 5969 5970 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5971 return 1; 5972 5973 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5974 true, sizeof(gpa_t), &gva)) 5975 return 1; 5976 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5977 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5978 sizeof(gpa_t), &e); 5979 if (r != X86EMUL_CONTINUE) 5980 return kvm_handle_memory_failure(vcpu, r, &e); 5981 5982 return nested_vmx_succeed(vcpu); 5983 } 5984 5985 /* Emulate the INVEPT instruction */ 5986 static int handle_invept(struct kvm_vcpu *vcpu) 5987 { 5988 struct vcpu_vmx *vmx = to_vmx(vcpu); 5989 u32 vmx_instruction_info, types; 5990 unsigned long type, roots_to_free; 5991 struct kvm_mmu *mmu; 5992 gva_t gva; 5993 struct x86_exception e; 5994 struct { 5995 u64 eptp, gpa; 5996 } operand; 5997 int i, r, gpr_index; 5998 5999 if (!(vmx->nested.msrs.secondary_ctls_high & 6000 SECONDARY_EXEC_ENABLE_EPT) || 6001 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 6002 kvm_queue_exception(vcpu, UD_VECTOR); 6003 return 1; 6004 } 6005 6006 if (!nested_vmx_check_permission(vcpu)) 6007 return 1; 6008 6009 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6010 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6011 type = 
kvm_register_read(vcpu, gpr_index); 6012 6013 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 6014 6015 if (type >= 32 || !(types & (1 << type))) 6016 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6017 6018 /* According to the Intel VMX instruction reference, the memory 6019 * operand is read even if it isn't needed (e.g., for type==global) 6020 */ 6021 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6022 vmx_instruction_info, false, sizeof(operand), &gva)) 6023 return 1; 6024 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6025 if (r != X86EMUL_CONTINUE) 6026 return kvm_handle_memory_failure(vcpu, r, &e); 6027 6028 /* 6029 * Nested EPT roots are always held through guest_mmu, 6030 * not root_mmu. 6031 */ 6032 mmu = &vcpu->arch.guest_mmu; 6033 6034 switch (type) { 6035 case VMX_EPT_EXTENT_CONTEXT: 6036 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 6037 return nested_vmx_fail(vcpu, 6038 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6039 6040 roots_to_free = 0; 6041 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 6042 operand.eptp)) 6043 roots_to_free |= KVM_MMU_ROOT_CURRENT; 6044 6045 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 6046 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 6047 mmu->prev_roots[i].pgd, 6048 operand.eptp)) 6049 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 6050 } 6051 break; 6052 case VMX_EPT_EXTENT_GLOBAL: 6053 roots_to_free = KVM_MMU_ROOTS_ALL; 6054 break; 6055 default: 6056 BUG(); 6057 break; 6058 } 6059 6060 if (roots_to_free) 6061 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 6062 6063 return nested_vmx_succeed(vcpu); 6064 } 6065 6066 static int handle_invvpid(struct kvm_vcpu *vcpu) 6067 { 6068 struct vcpu_vmx *vmx = to_vmx(vcpu); 6069 u32 vmx_instruction_info; 6070 unsigned long type, types; 6071 gva_t gva; 6072 struct x86_exception e; 6073 struct { 6074 u64 vpid; 6075 u64 gla; 6076 } operand; 6077 u16 vpid02; 6078 int r, gpr_index; 6079 6080 if 
(!(vmx->nested.msrs.secondary_ctls_high & 6081 SECONDARY_EXEC_ENABLE_VPID) || 6082 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 6083 kvm_queue_exception(vcpu, UD_VECTOR); 6084 return 1; 6085 } 6086 6087 if (!nested_vmx_check_permission(vcpu)) 6088 return 1; 6089 6090 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6091 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6092 type = kvm_register_read(vcpu, gpr_index); 6093 6094 types = (vmx->nested.msrs.vpid_caps & 6095 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6096 6097 if (type >= 32 || !(types & (1 << type))) 6098 return nested_vmx_fail(vcpu, 6099 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6100 6101 /* according to the intel vmx instruction reference, the memory 6102 * operand is read even if it isn't needed (e.g., for type==global) 6103 */ 6104 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6105 vmx_instruction_info, false, sizeof(operand), &gva)) 6106 return 1; 6107 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6108 if (r != X86EMUL_CONTINUE) 6109 return kvm_handle_memory_failure(vcpu, r, &e); 6110 6111 if (operand.vpid >> 16) 6112 return nested_vmx_fail(vcpu, 6113 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6114 6115 /* 6116 * Always flush the effective vpid02, i.e. never flush the current VPID 6117 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6118 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6119 * irrelevant (and there may not be a loaded vmcs12). 6120 */ 6121 vpid02 = nested_get_vpid02(vcpu); 6122 switch (type) { 6123 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6124 /* 6125 * LAM doesn't apply to addresses that are inputs to TLB 6126 * invalidation. 
6127 */ 6128 if (!operand.vpid || 6129 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6130 return nested_vmx_fail(vcpu, 6131 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6132 vpid_sync_vcpu_addr(vpid02, operand.gla); 6133 break; 6134 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6135 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6136 if (!operand.vpid) 6137 return nested_vmx_fail(vcpu, 6138 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6139 vpid_sync_context(vpid02); 6140 break; 6141 case VMX_VPID_EXTENT_ALL_CONTEXT: 6142 vpid_sync_context(vpid02); 6143 break; 6144 default: 6145 WARN_ON_ONCE(1); 6146 return kvm_skip_emulated_instruction(vcpu); 6147 } 6148 6149 /* 6150 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6151 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6152 * roots as VPIDs are not tracked in the MMU role. 6153 * 6154 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6155 * an MMU when EPT is disabled. 6156 * 6157 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
6158 */ 6159 if (!enable_ept) 6160 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6161 6162 return nested_vmx_succeed(vcpu); 6163 } 6164 6165 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6166 struct vmcs12 *vmcs12) 6167 { 6168 u32 index = kvm_rcx_read(vcpu); 6169 u64 new_eptp; 6170 6171 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6172 return 1; 6173 if (index >= VMFUNC_EPTP_ENTRIES) 6174 return 1; 6175 6176 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6177 &new_eptp, index * 8, 8)) 6178 return 1; 6179 6180 /* 6181 * If the (L2) guest does a vmfunc to the currently 6182 * active ept pointer, we don't have to do anything else 6183 */ 6184 if (vmcs12->ept_pointer != new_eptp) { 6185 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6186 return 1; 6187 6188 vmcs12->ept_pointer = new_eptp; 6189 nested_ept_new_eptp(vcpu); 6190 6191 if (!nested_cpu_has_vpid(vmcs12)) 6192 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6193 } 6194 6195 return 0; 6196 } 6197 6198 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6199 { 6200 struct vcpu_vmx *vmx = to_vmx(vcpu); 6201 struct vmcs12 *vmcs12; 6202 u32 function = kvm_rax_read(vcpu); 6203 6204 /* 6205 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6206 * VMFUNC for nested VMs, but not for L1. 6207 */ 6208 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6209 kvm_queue_exception(vcpu, UD_VECTOR); 6210 return 1; 6211 } 6212 6213 vmcs12 = get_vmcs12(vcpu); 6214 6215 /* 6216 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6217 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 
6218 */ 6219 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6220 kvm_queue_exception(vcpu, UD_VECTOR); 6221 return 1; 6222 } 6223 6224 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6225 goto fail; 6226 6227 switch (function) { 6228 case 0: 6229 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6230 goto fail; 6231 break; 6232 default: 6233 goto fail; 6234 } 6235 return kvm_skip_emulated_instruction(vcpu); 6236 6237 fail: 6238 /* 6239 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6240 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6241 * EXIT_REASON_VMFUNC as the exit reason. 6242 */ 6243 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6244 vmx_get_intr_info(vcpu), 6245 vmx_get_exit_qual(vcpu)); 6246 return 1; 6247 } 6248 6249 /* 6250 * Return true if an IO instruction with the specified port and size should cause 6251 * a VM-exit into L1. 6252 */ 6253 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6254 int size) 6255 { 6256 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6257 gpa_t bitmap, last_bitmap; 6258 u8 b; 6259 6260 last_bitmap = INVALID_GPA; 6261 b = -1; 6262 6263 while (size > 0) { 6264 if (port < 0x8000) 6265 bitmap = vmcs12->io_bitmap_a; 6266 else if (port < 0x10000) 6267 bitmap = vmcs12->io_bitmap_b; 6268 else 6269 return true; 6270 bitmap += (port & 0x7fff) / 8; 6271 6272 if (last_bitmap != bitmap) 6273 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6274 return true; 6275 if (b & (1 << (port & 7))) 6276 return true; 6277 6278 port++; 6279 size--; 6280 last_bitmap = bitmap; 6281 } 6282 6283 return false; 6284 } 6285 6286 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6287 struct vmcs12 *vmcs12) 6288 { 6289 unsigned long exit_qualification; 6290 unsigned short port; 6291 int size; 6292 6293 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6294 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6295 6296 exit_qualification = 
vmx_get_exit_qual(vcpu); 6297 6298 port = exit_qualification >> 16; 6299 size = (exit_qualification & 7) + 1; 6300 6301 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6302 } 6303 6304 /* 6305 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6306 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6307 * disinterest in the current event (read or write a specific MSR) by using an 6308 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6309 */ 6310 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6311 struct vmcs12 *vmcs12, 6312 union vmx_exit_reason exit_reason) 6313 { 6314 u32 msr_index; 6315 gpa_t bitmap; 6316 6317 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6318 return true; 6319 6320 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6321 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6322 msr_index = vmx_get_exit_qual(vcpu); 6323 else 6324 msr_index = kvm_rcx_read(vcpu); 6325 6326 /* 6327 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6328 * for the four combinations of read/write and low/high MSR numbers. 6329 * First we need to figure out which of the four to use: 6330 */ 6331 bitmap = vmcs12->msr_bitmap; 6332 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6333 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6334 bitmap += 2048; 6335 if (msr_index >= 0xc0000000) { 6336 msr_index -= 0xc0000000; 6337 bitmap += 1024; 6338 } 6339 6340 /* Then read the msr_index'th bit from this bitmap: */ 6341 if (msr_index < 1024*8) { 6342 unsigned char b; 6343 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6344 return true; 6345 return 1 & (b >> (msr_index & 7)); 6346 } else 6347 return true; /* let L1 handle the wrong parameter */ 6348 } 6349 6350 /* 6351 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6352 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6353 * intercept (via guest_host_mask etc.) 
the current event. 6354 */ 6355 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6356 struct vmcs12 *vmcs12) 6357 { 6358 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6359 int cr = exit_qualification & 15; 6360 int reg; 6361 unsigned long val; 6362 6363 switch ((exit_qualification >> 4) & 3) { 6364 case 0: /* mov to cr */ 6365 reg = (exit_qualification >> 8) & 15; 6366 val = kvm_register_read(vcpu, reg); 6367 switch (cr) { 6368 case 0: 6369 if (vmcs12->cr0_guest_host_mask & 6370 (val ^ vmcs12->cr0_read_shadow)) 6371 return true; 6372 break; 6373 case 3: 6374 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6375 return true; 6376 break; 6377 case 4: 6378 if (vmcs12->cr4_guest_host_mask & 6379 (vmcs12->cr4_read_shadow ^ val)) 6380 return true; 6381 break; 6382 case 8: 6383 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6384 return true; 6385 break; 6386 } 6387 break; 6388 case 2: /* clts */ 6389 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6390 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6391 return true; 6392 break; 6393 case 1: /* mov from cr */ 6394 switch (cr) { 6395 case 3: 6396 if (vmcs12->cpu_based_vm_exec_control & 6397 CPU_BASED_CR3_STORE_EXITING) 6398 return true; 6399 break; 6400 case 8: 6401 if (vmcs12->cpu_based_vm_exec_control & 6402 CPU_BASED_CR8_STORE_EXITING) 6403 return true; 6404 break; 6405 } 6406 break; 6407 case 3: /* lmsw */ 6408 /* 6409 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6410 * cr0. Other attempted changes are ignored, with no exit. 
6411 */ 6412 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6413 if (vmcs12->cr0_guest_host_mask & 0xe & 6414 (val ^ vmcs12->cr0_read_shadow)) 6415 return true; 6416 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6417 !(vmcs12->cr0_read_shadow & 0x1) && 6418 (val & 0x1)) 6419 return true; 6420 break; 6421 } 6422 return false; 6423 } 6424 6425 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6426 struct vmcs12 *vmcs12) 6427 { 6428 u32 encls_leaf; 6429 6430 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6431 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6432 return false; 6433 6434 encls_leaf = kvm_rax_read(vcpu); 6435 if (encls_leaf > 62) 6436 encls_leaf = 63; 6437 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6438 } 6439 6440 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6441 struct vmcs12 *vmcs12, gpa_t bitmap) 6442 { 6443 u32 vmx_instruction_info; 6444 unsigned long field; 6445 u8 b; 6446 6447 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6448 return true; 6449 6450 /* Decode instruction info and find the field to access */ 6451 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6452 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6453 6454 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6455 if (field >> 15) 6456 return true; 6457 6458 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6459 return true; 6460 6461 return 1 & (b >> (field & 7)); 6462 } 6463 6464 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6465 { 6466 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6467 6468 if (nested_cpu_has_mtf(vmcs12)) 6469 return true; 6470 6471 /* 6472 * An MTF VM-exit may be injected into the guest by setting the 6473 * interruption-type to 7 (other event) and the vector field to 0. Such 6474 * is the case regardless of the 'monitor trap flag' VM-execution 6475 * control. 
6476 */ 6477 return entry_intr_info == (INTR_INFO_VALID_MASK 6478 | INTR_TYPE_OTHER_EVENT); 6479 } 6480 6481 /* 6482 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6483 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6484 */ 6485 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6486 union vmx_exit_reason exit_reason) 6487 { 6488 u32 intr_info; 6489 6490 switch ((u16)exit_reason.basic) { 6491 case EXIT_REASON_EXCEPTION_NMI: 6492 intr_info = vmx_get_intr_info(vcpu); 6493 if (is_nmi(intr_info)) 6494 return true; 6495 else if (is_page_fault(intr_info)) 6496 return vcpu->arch.apf.host_apf_flags || 6497 vmx_need_pf_intercept(vcpu); 6498 else if (is_debug(intr_info) && 6499 vcpu->guest_debug & 6500 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6501 return true; 6502 else if (is_breakpoint(intr_info) && 6503 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6504 return true; 6505 else if (is_alignment_check(intr_info) && 6506 !vmx_guest_inject_ac(vcpu)) 6507 return true; 6508 else if (is_ve_fault(intr_info)) 6509 return true; 6510 return false; 6511 case EXIT_REASON_EXTERNAL_INTERRUPT: 6512 return true; 6513 case EXIT_REASON_MCE_DURING_VMENTRY: 6514 return true; 6515 case EXIT_REASON_EPT_VIOLATION: 6516 /* 6517 * L0 always deals with the EPT violation. If nested EPT is 6518 * used, and the nested mmu code discovers that the address is 6519 * missing in the guest EPT table (EPT12), the EPT violation 6520 * will be injected with nested_ept_inject_page_fault() 6521 */ 6522 return true; 6523 case EXIT_REASON_EPT_MISCONFIG: 6524 /* 6525 * L2 never uses directly L1's EPT, but rather L0's own EPT 6526 * table (shadow on EPT) or a merged EPT table that L0 built 6527 * (EPT on EPT). So any problems with the structure of the 6528 * table is L0's fault. 
6529 */ 6530 return true; 6531 case EXIT_REASON_PREEMPTION_TIMER: 6532 return true; 6533 case EXIT_REASON_PML_FULL: 6534 /* 6535 * PML is emulated for an L1 VMM and should never be enabled in 6536 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6537 */ 6538 return true; 6539 case EXIT_REASON_VMFUNC: 6540 /* VM functions are emulated through L2->L0 vmexits. */ 6541 return true; 6542 case EXIT_REASON_BUS_LOCK: 6543 /* 6544 * At present, bus lock VM exit is never exposed to L1. 6545 * Handle L2's bus locks in L0 directly. 6546 */ 6547 return true; 6548 #ifdef CONFIG_KVM_HYPERV 6549 case EXIT_REASON_VMCALL: 6550 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6551 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6552 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6553 kvm_hv_is_tlb_flush_hcall(vcpu); 6554 #endif 6555 default: 6556 break; 6557 } 6558 return false; 6559 } 6560 6561 /* 6562 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6563 * is_guest_mode (L2). 
6564 */ 6565 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6566 union vmx_exit_reason exit_reason) 6567 { 6568 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6569 u32 intr_info; 6570 6571 switch ((u16)exit_reason.basic) { 6572 case EXIT_REASON_EXCEPTION_NMI: 6573 intr_info = vmx_get_intr_info(vcpu); 6574 if (is_nmi(intr_info)) 6575 return true; 6576 else if (is_page_fault(intr_info)) 6577 return true; 6578 return vmcs12->exception_bitmap & 6579 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6580 case EXIT_REASON_EXTERNAL_INTERRUPT: 6581 return nested_exit_on_intr(vcpu); 6582 case EXIT_REASON_TRIPLE_FAULT: 6583 return true; 6584 case EXIT_REASON_INTERRUPT_WINDOW: 6585 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6586 case EXIT_REASON_NMI_WINDOW: 6587 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6588 case EXIT_REASON_TASK_SWITCH: 6589 return true; 6590 case EXIT_REASON_CPUID: 6591 return true; 6592 case EXIT_REASON_HLT: 6593 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6594 case EXIT_REASON_INVD: 6595 return true; 6596 case EXIT_REASON_INVLPG: 6597 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6598 case EXIT_REASON_RDPMC: 6599 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6600 case EXIT_REASON_RDRAND: 6601 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6602 case EXIT_REASON_RDSEED: 6603 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6604 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6605 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6606 case EXIT_REASON_VMREAD: 6607 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6608 vmcs12->vmread_bitmap); 6609 case EXIT_REASON_VMWRITE: 6610 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6611 vmcs12->vmwrite_bitmap); 6612 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6613 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6614 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6615 case 
EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6616 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6617 /* 6618 * VMX instructions trap unconditionally. This allows L1 to 6619 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6620 */ 6621 return true; 6622 case EXIT_REASON_CR_ACCESS: 6623 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6624 case EXIT_REASON_DR_ACCESS: 6625 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6626 case EXIT_REASON_IO_INSTRUCTION: 6627 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6628 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6629 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6630 case EXIT_REASON_MSR_READ: 6631 case EXIT_REASON_MSR_WRITE: 6632 case EXIT_REASON_MSR_READ_IMM: 6633 case EXIT_REASON_MSR_WRITE_IMM: 6634 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6635 case EXIT_REASON_INVALID_STATE: 6636 return true; 6637 case EXIT_REASON_MWAIT_INSTRUCTION: 6638 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6639 case EXIT_REASON_MONITOR_TRAP_FLAG: 6640 return nested_vmx_exit_handled_mtf(vmcs12); 6641 case EXIT_REASON_MONITOR_INSTRUCTION: 6642 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6643 case EXIT_REASON_PAUSE_INSTRUCTION: 6644 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6645 nested_cpu_has2(vmcs12, 6646 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6647 case EXIT_REASON_MCE_DURING_VMENTRY: 6648 return true; 6649 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6650 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6651 case EXIT_REASON_APIC_ACCESS: 6652 case EXIT_REASON_APIC_WRITE: 6653 case EXIT_REASON_EOI_INDUCED: 6654 /* 6655 * The controls for "virtualize APIC accesses," "APIC- 6656 * register virtualization," and "virtual-interrupt 6657 * delivery" only come from vmcs12. 
6658 */ 6659 return true; 6660 case EXIT_REASON_INVPCID: 6661 return 6662 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6663 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6664 case EXIT_REASON_WBINVD: 6665 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6666 case EXIT_REASON_XSETBV: 6667 return true; 6668 case EXIT_REASON_XSAVES: 6669 case EXIT_REASON_XRSTORS: 6670 /* 6671 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize 6672 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap 6673 * verbatim, i.e. any exit is due to L1's bitmap. WARN if 6674 * XSAVES isn't enabled, as the CPU is supposed to inject #UD 6675 * in that case, before consulting the XSS-bitmap. 6676 */ 6677 WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); 6678 return true; 6679 case EXIT_REASON_UMWAIT: 6680 case EXIT_REASON_TPAUSE: 6681 return nested_cpu_has2(vmcs12, 6682 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6683 case EXIT_REASON_ENCLS: 6684 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6685 case EXIT_REASON_NOTIFY: 6686 /* Notify VM exit is not exposed to L1 */ 6687 return false; 6688 case EXIT_REASON_SEAMCALL: 6689 case EXIT_REASON_TDCALL: 6690 /* 6691 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't 6692 * virtualized by KVM for L1 hypervisors, i.e. L1 should 6693 * never want or expect such an exit. 6694 */ 6695 return false; 6696 default: 6697 return true; 6698 } 6699 } 6700 6701 /* 6702 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6703 * reflected into L1. 6704 */ 6705 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6706 { 6707 struct vcpu_vmx *vmx = to_vmx(vcpu); 6708 union vmx_exit_reason exit_reason = vmx->vt.exit_reason; 6709 unsigned long exit_qual; 6710 u32 exit_intr_info; 6711 6712 kvm_warn_on_nested_run_pending(vcpu); 6713 6714 /* 6715 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6716 * has already loaded L2's state. 
6717 */ 6718 if (unlikely(vmx->fail)) { 6719 trace_kvm_nested_vmenter_failed( 6720 "hardware VM-instruction error: ", 6721 vmcs_read32(VM_INSTRUCTION_ERROR)); 6722 exit_intr_info = 0; 6723 exit_qual = 0; 6724 goto reflect_vmexit; 6725 } 6726 6727 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6728 6729 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6730 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6731 return false; 6732 6733 /* If L1 doesn't want the exit, handle it in L0. */ 6734 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6735 return false; 6736 6737 /* 6738 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6739 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6740 * need to be synthesized by querying the in-kernel LAPIC, but external 6741 * interrupts are never reflected to L1 so it's a non-issue. 6742 */ 6743 exit_intr_info = vmx_get_intr_info(vcpu); 6744 if (is_exception_with_error_code(exit_intr_info)) { 6745 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6746 6747 vmcs12->vm_exit_intr_error_code = 6748 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6749 } 6750 exit_qual = vmx_get_exit_qual(vcpu); 6751 6752 reflect_vmexit: 6753 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6754 return true; 6755 } 6756 6757 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6758 struct kvm_nested_state __user *user_kvm_nested_state, 6759 u32 user_data_size) 6760 { 6761 struct vcpu_vmx *vmx; 6762 struct vmcs12 *vmcs12; 6763 struct kvm_nested_state kvm_state = { 6764 .flags = 0, 6765 .format = KVM_STATE_NESTED_FORMAT_VMX, 6766 .size = sizeof(kvm_state), 6767 .hdr.vmx.flags = 0, 6768 .hdr.vmx.vmxon_pa = INVALID_GPA, 6769 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6770 .hdr.vmx.preemption_timer_deadline = 0, 6771 }; 6772 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6773 &user_kvm_nested_state->data.vmx[0]; 6774 6775 if (!vcpu) 6776 return kvm_state.size + sizeof(*user_vmx_nested_state); 6777 6778 vmx 
= to_vmx(vcpu); 6779 vmcs12 = get_vmcs12(vcpu); 6780 6781 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6782 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6783 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6784 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6785 6786 if (vmx_has_valid_vmcs12(vcpu)) { 6787 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6788 6789 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6790 if (nested_vmx_is_evmptr12_set(vmx)) 6791 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6792 6793 if (is_guest_mode(vcpu) && 6794 nested_cpu_has_shadow_vmcs(vmcs12) && 6795 vmcs12->vmcs_link_pointer != INVALID_GPA) 6796 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6797 } 6798 6799 if (vmx->nested.smm.vmxon) 6800 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6801 6802 if (vmx->nested.smm.guest_mode) 6803 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6804 6805 if (is_guest_mode(vcpu)) { 6806 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6807 6808 if (vcpu->arch.nested_run_pending) 6809 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6810 6811 if (vmx->nested.mtf_pending) 6812 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6813 6814 if (nested_cpu_has_preemption_timer(vmcs12) && 6815 vmx->nested.has_preemption_timer_deadline) { 6816 kvm_state.hdr.vmx.flags |= 6817 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6818 kvm_state.hdr.vmx.preemption_timer_deadline = 6819 vmx->nested.preemption_timer_deadline; 6820 } 6821 } 6822 } 6823 6824 if (user_data_size < kvm_state.size) 6825 goto out; 6826 6827 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6828 return -EFAULT; 6829 6830 if (!vmx_has_valid_vmcs12(vcpu)) 6831 goto out; 6832 6833 /* 6834 * When running L2, the authoritative vmcs12 state is in the 6835 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6836 * in the shadow or enlightened vmcs linked to vmcs01, unless 6837 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6838 * vmcs12 state is in the vmcs12 already. 6839 */ 6840 if (is_guest_mode(vcpu)) { 6841 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6842 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6843 } else { 6844 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6845 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6846 if (nested_vmx_is_evmptr12_valid(vmx)) 6847 /* 6848 * L1 hypervisor is not obliged to keep eVMCS 6849 * clean fields data always up-to-date while 6850 * not in guest mode, 'hv_clean_fields' is only 6851 * supposed to be actual upon vmentry so we need 6852 * to ignore it here and do full copy. 6853 */ 6854 copy_enlightened_to_vmcs12(vmx, 0); 6855 else if (enable_shadow_vmcs) 6856 copy_shadow_to_vmcs12(vmx); 6857 } 6858 } 6859 6860 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6861 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6862 6863 /* 6864 * Copy over the full allocated size of vmcs12 rather than just the size 6865 * of the struct. 
6866 */ 6867 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6868 return -EFAULT; 6869 6870 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6871 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6872 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6873 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6874 return -EFAULT; 6875 } 6876 out: 6877 return kvm_state.size; 6878 } 6879 6880 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6881 { 6882 if (is_guest_mode(vcpu)) { 6883 vcpu->arch.nested_run_pending = 0; 6884 nested_vmx_vmexit(vcpu, -1, 0, 0); 6885 } 6886 free_nested(vcpu); 6887 } 6888 6889 int nested_vmx_check_restored_vmcs12(struct kvm_vcpu *vcpu) 6890 { 6891 enum vm_entry_failure_code ignored; 6892 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6893 6894 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6895 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6896 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6897 6898 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6899 !shadow_vmcs12->hdr.shadow_vmcs) 6900 return -EINVAL; 6901 } 6902 6903 if (nested_vmx_check_controls(vcpu, vmcs12) || 6904 nested_vmx_check_host_state(vcpu, vmcs12) || 6905 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6906 return -EINVAL; 6907 6908 return 0; 6909 } 6910 6911 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6912 struct kvm_nested_state __user *user_kvm_nested_state, 6913 struct kvm_nested_state *kvm_state) 6914 { 6915 struct vcpu_vmx *vmx = to_vmx(vcpu); 6916 struct vmcs12 *vmcs12; 6917 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6918 &user_kvm_nested_state->data.vmx[0]; 6919 int ret; 6920 6921 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6922 return -EINVAL; 6923 6924 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6925 if (kvm_state->hdr.vmx.smm.flags) 6926 return -EINVAL; 6927 6928 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6929 return -EINVAL; 6930 6931 /* 6932 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 
6933 * enable eVMCS capability on vCPU. However, since then 6934 * code was changed such that flag signals vmcs12 should 6935 * be copied into eVMCS in guest memory. 6936 * 6937 * To preserve backwards compatibility, allow user 6938 * to set this flag even when there is no VMXON region. 6939 */ 6940 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6941 return -EINVAL; 6942 } else { 6943 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 6944 return -EINVAL; 6945 6946 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6947 return -EINVAL; 6948 } 6949 6950 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6951 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6952 return -EINVAL; 6953 6954 if (kvm_state->hdr.vmx.smm.flags & 6955 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6956 return -EINVAL; 6957 6958 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6959 return -EINVAL; 6960 6961 /* 6962 * SMM temporarily disables VMX, so we cannot be in guest mode, 6963 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6964 * must be zero. 6965 */ 6966 if (is_smm(vcpu) ? 
6967 (kvm_state->flags & 6968 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6969 : kvm_state->hdr.vmx.smm.flags) 6970 return -EINVAL; 6971 6972 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6973 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6974 return -EINVAL; 6975 6976 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6977 (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || 6978 !vmx->nested.enlightened_vmcs_enabled)) 6979 return -EINVAL; 6980 6981 vmx_leave_nested(vcpu); 6982 6983 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6984 return 0; 6985 6986 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6987 ret = enter_vmx_operation(vcpu); 6988 if (ret) 6989 return ret; 6990 6991 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6992 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6993 /* See vmx_has_valid_vmcs12. */ 6994 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6995 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6996 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6997 return -EINVAL; 6998 else 6999 return 0; 7000 } 7001 7002 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 7003 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 7004 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 7005 return -EINVAL; 7006 7007 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 7008 #ifdef CONFIG_KVM_HYPERV 7009 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 7010 /* 7011 * nested_vmx_handle_enlightened_vmptrld() cannot be called 7012 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 7013 * restored yet. EVMCS will be mapped from 7014 * nested_get_vmcs12_pages(). 
7015 */ 7016 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 7017 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 7018 #endif 7019 } else { 7020 return -EINVAL; 7021 } 7022 7023 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 7024 vmx->nested.smm.vmxon = true; 7025 vmx->nested.vmxon = false; 7026 7027 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 7028 vmx->nested.smm.guest_mode = true; 7029 } 7030 7031 vmcs12 = get_vmcs12(vcpu); 7032 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 7033 return -EFAULT; 7034 7035 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 7036 return -EINVAL; 7037 7038 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7039 return 0; 7040 7041 if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 7042 vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 7043 else 7044 vcpu->arch.nested_run_pending = 0; 7045 7046 vmx->nested.mtf_pending = 7047 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 7048 7049 if (nested_cpu_has_shadow_vmcs(vmcs12) && 7050 vmcs12->vmcs_link_pointer != INVALID_GPA) { 7051 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 7052 7053 ret = -EINVAL; 7054 if (kvm_state->size < 7055 sizeof(*kvm_state) + 7056 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 7057 goto error_guest_mode; 7058 7059 ret = -EFAULT; 7060 if (copy_from_user(shadow_vmcs12, 7061 user_vmx_nested_state->shadow_vmcs12, 7062 sizeof(*shadow_vmcs12))) 7063 goto error_guest_mode; 7064 } 7065 7066 vmx->nested.has_preemption_timer_deadline = false; 7067 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 7068 vmx->nested.has_preemption_timer_deadline = true; 7069 vmx->nested.preemption_timer_deadline = 7070 kvm_state->hdr.vmx.preemption_timer_deadline; 7071 } 7072 7073 ret = nested_vmx_check_restored_vmcs12(vcpu); 7074 if (ret < 0) 7075 goto error_guest_mode; 7076 7077 vmx->nested.dirty_vmcs12 = true; 7078 
vmx->nested.force_msr_bitmap_recalc = true; 7079 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7080 if (ret) 7081 goto error_guest_mode; 7082 7083 if (vmx->nested.mtf_pending) 7084 kvm_make_request(KVM_REQ_EVENT, vcpu); 7085 7086 return 0; 7087 7088 error_guest_mode: 7089 vcpu->arch.nested_run_pending = 0; 7090 return ret; 7091 } 7092 7093 void nested_vmx_set_vmcs_shadowing_bitmap(void) 7094 { 7095 if (enable_shadow_vmcs) { 7096 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 7097 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 7098 } 7099 } 7100 7101 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7102 { 7103 /* 7104 * Note these are the so called "index" of the VMCS field encoding, not 7105 * the index into vmcs12. 7106 */ 7107 unsigned int max_idx, idx; 7108 int i; 7109 7110 /* 7111 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 7112 * vmcs12, regardless of whether or not the associated feature is 7113 * exposed to L1. Simply find the field with the highest index. 7114 */ 7115 max_idx = 0; 7116 for (i = 0; i < nr_vmcs12_fields; i++) { 7117 /* The vmcs12 table is very, very sparsely populated. */ 7118 if (!vmcs12_field_offsets[i]) 7119 continue; 7120 7121 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 7122 if (idx > max_idx) 7123 max_idx = idx; 7124 } 7125 7126 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 7127 } 7128 7129 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 7130 struct nested_vmx_msrs *msrs) 7131 { 7132 msrs->pinbased_ctls_low = 7133 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7134 7135 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 7136 msrs->pinbased_ctls_high &= 7137 PIN_BASED_EXT_INTR_MASK | 7138 PIN_BASED_NMI_EXITING | 7139 PIN_BASED_VIRTUAL_NMIS | 7140 (enable_apicv ? 
PIN_BASED_POSTED_INTR : 0); 7141 msrs->pinbased_ctls_high |= 7142 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7143 PIN_BASED_VMX_PREEMPTION_TIMER; 7144 } 7145 7146 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 7147 struct nested_vmx_msrs *msrs) 7148 { 7149 msrs->exit_ctls_low = 7150 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 7151 7152 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 7153 msrs->exit_ctls_high &= 7154 #ifdef CONFIG_X86_64 7155 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7156 #endif 7157 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7158 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7159 msrs->exit_ctls_high |= 7160 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7161 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7162 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7163 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7164 7165 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7166 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7167 msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7168 7169 /* We support free control of debug control saving. */ 7170 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 7171 } 7172 7173 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 7174 struct nested_vmx_msrs *msrs) 7175 { 7176 msrs->entry_ctls_low = 7177 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 7178 7179 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 7180 msrs->entry_ctls_high &= 7181 #ifdef CONFIG_X86_64 7182 VM_ENTRY_IA32E_MODE | 7183 #endif 7184 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7185 VM_ENTRY_LOAD_CET_STATE; 7186 msrs->entry_ctls_high |= 7187 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7188 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7189 7190 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7191 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7192 msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7193 7194 /* We support free control of debug control loading. 
*/ 7195 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 7196 } 7197 7198 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 7199 struct nested_vmx_msrs *msrs) 7200 { 7201 msrs->procbased_ctls_low = 7202 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7203 7204 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 7205 msrs->procbased_ctls_high &= 7206 CPU_BASED_INTR_WINDOW_EXITING | 7207 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 7208 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 7209 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 7210 CPU_BASED_CR3_STORE_EXITING | 7211 #ifdef CONFIG_X86_64 7212 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 7213 #endif 7214 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 7215 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 7216 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 7217 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 7218 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 7219 /* 7220 * We can allow some features even when not supported by the 7221 * hardware. For example, L1 can specify an MSR bitmap - and we 7222 * can use it to avoid exits to L1 - even when L0 runs L2 7223 * without MSR bitmaps. 7224 */ 7225 msrs->procbased_ctls_high |= 7226 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7227 CPU_BASED_USE_MSR_BITMAPS; 7228 7229 /* We support free control of CR3 access interception. 
*/ 7230 msrs->procbased_ctls_low &= 7231 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 7232 } 7233 7234 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 7235 struct vmcs_config *vmcs_conf, 7236 struct nested_vmx_msrs *msrs) 7237 { 7238 msrs->secondary_ctls_low = 0; 7239 7240 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 7241 msrs->secondary_ctls_high &= 7242 SECONDARY_EXEC_DESC | 7243 SECONDARY_EXEC_ENABLE_RDTSCP | 7244 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7245 SECONDARY_EXEC_WBINVD_EXITING | 7246 SECONDARY_EXEC_APIC_REGISTER_VIRT | 7247 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 7248 SECONDARY_EXEC_RDRAND_EXITING | 7249 SECONDARY_EXEC_ENABLE_INVPCID | 7250 SECONDARY_EXEC_ENABLE_VMFUNC | 7251 SECONDARY_EXEC_RDSEED_EXITING | 7252 SECONDARY_EXEC_ENABLE_XSAVES | 7253 SECONDARY_EXEC_TSC_SCALING | 7254 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 7255 7256 /* 7257 * We can emulate "VMCS shadowing," even if the hardware 7258 * doesn't support it. 7259 */ 7260 msrs->secondary_ctls_high |= 7261 SECONDARY_EXEC_SHADOW_VMCS; 7262 7263 if (enable_ept) { 7264 /* nested EPT: emulate EPT also to L1 */ 7265 msrs->secondary_ctls_high |= 7266 SECONDARY_EXEC_ENABLE_EPT; 7267 msrs->ept_caps = 7268 VMX_EPT_PAGE_WALK_4_BIT | 7269 VMX_EPT_PAGE_WALK_5_BIT | 7270 VMX_EPTP_WB_BIT | 7271 VMX_EPT_INVEPT_BIT | 7272 VMX_EPT_EXECUTE_ONLY_BIT | 7273 VMX_EPT_ADVANCED_VMEXIT_INFO_BIT; 7274 7275 msrs->ept_caps &= ept_caps; 7276 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 7277 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 7278 VMX_EPT_1GB_PAGE_BIT; 7279 if (enable_ept_ad_bits) { 7280 msrs->secondary_ctls_high |= 7281 SECONDARY_EXEC_ENABLE_PML; 7282 msrs->ept_caps |= VMX_EPT_AD_BIT; 7283 } 7284 7285 if (enable_mbec) 7286 msrs->secondary_ctls_high |= 7287 SECONDARY_EXEC_MODE_BASED_EPT_EXEC; 7288 /* 7289 * Advertise EPTP switching irrespective of hardware support, 7290 * KVM emulates it in software so long as VMFUNC is supported. 
7291 */ 7292 if (cpu_has_vmx_vmfunc()) 7293 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 7294 } 7295 7296 /* 7297 * Old versions of KVM use the single-context version without 7298 * checking for support, so declare that it is supported even 7299 * though it is treated as global context. The alternative is 7300 * not failing the single-context invvpid, and it is worse. 7301 */ 7302 if (enable_vpid) { 7303 msrs->secondary_ctls_high |= 7304 SECONDARY_EXEC_ENABLE_VPID; 7305 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 7306 VMX_VPID_EXTENT_SUPPORTED_MASK; 7307 } 7308 7309 if (enable_unrestricted_guest) 7310 msrs->secondary_ctls_high |= 7311 SECONDARY_EXEC_UNRESTRICTED_GUEST; 7312 7313 if (flexpriority_enabled) 7314 msrs->secondary_ctls_high |= 7315 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7316 7317 if (enable_sgx) 7318 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7319 } 7320 7321 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7322 struct nested_vmx_msrs *msrs) 7323 { 7324 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7325 msrs->misc_low |= 7326 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7327 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7328 VMX_MISC_ACTIVITY_HLT | 7329 VMX_MISC_ACTIVITY_WAIT_SIPI; 7330 msrs->misc_high = 0; 7331 } 7332 7333 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7334 { 7335 /* 7336 * This MSR reports some information about VMX support. We 7337 * should return information about the VMX we emulate for the 7338 * guest, and the VMCS structure we give it - not about the 7339 * VMX support of the underlying hardware. 
7340 */ 7341 msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7342 X86_MEMTYPE_WB); 7343 7344 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7345 if (cpu_has_vmx_basic_inout()) 7346 msrs->basic |= VMX_BASIC_INOUT; 7347 if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7348 msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7349 } 7350 7351 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7352 { 7353 /* 7354 * These MSRs specify bits which the guest must keep fixed on 7355 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7356 * We picked the standard core2 setting. 7357 */ 7358 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7359 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7360 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7361 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7362 7363 /* These MSRs specify bits which the guest must keep fixed off. */ 7364 rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7365 rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7366 7367 if (vmx_umip_emulated()) 7368 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7369 } 7370 7371 /* 7372 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7373 * returned for the various VMX controls MSRs when nested VMX is enabled. 7374 * The same values should also be used to verify that vmcs12 control fields are 7375 * valid during nested entry from L1 to L2. 7376 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7377 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7378 * bit in the high half is on if the corresponding bit in the control field 7379 * may be on. See also vmx_control_verify(). 
7380 */ 7381 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7382 { 7383 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7384 7385 /* 7386 * Note that as a general rule, the high half of the MSRs (bits in 7387 * the control fields which may be 1) should be initialized by the 7388 * intersection of the underlying hardware's MSR (i.e., features which 7389 * can be supported) and the list of features we want to expose - 7390 * because they are known to be properly supported in our code. 7391 * Also, usually, the low half of the MSRs (bits which must be 1) can 7392 * be set to 0, meaning that L1 may turn off any of these bits. The 7393 * reason is that if one of these bits is necessary, it will appear 7394 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7395 * fields of vmcs01 and vmcs02, will turn these bits off - and 7396 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7397 * These rules have exceptions below. 7398 */ 7399 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7400 7401 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7402 7403 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7404 7405 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7406 7407 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7408 7409 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7410 7411 nested_vmx_setup_basic(msrs); 7412 7413 nested_vmx_setup_cr_fixed(msrs); 7414 7415 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7416 } 7417 7418 void nested_vmx_hardware_unsetup(void) 7419 { 7420 int i; 7421 7422 if (enable_shadow_vmcs) { 7423 for (i = 0; i < VMX_BITMAP_NR; i++) 7424 free_page((unsigned long)vmx_bitmap[i]); 7425 } 7426 } 7427 7428 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7429 { 7430 int i; 7431 7432 /* 7433 * Note! The set of supported vmcs12 fields is consumed by both VMX 7434 * MSR and shadow VMCS setup. 
7435 */ 7436 nested_vmx_setup_vmcs12_fields(); 7437 7438 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7439 7440 if (!cpu_has_vmx_shadow_vmcs()) 7441 enable_shadow_vmcs = 0; 7442 if (enable_shadow_vmcs) { 7443 for (i = 0; i < VMX_BITMAP_NR; i++) { 7444 /* 7445 * The vmx_bitmap is not tied to a VM and so should 7446 * not be charged to a memcg. 7447 */ 7448 vmx_bitmap[i] = (unsigned long *) 7449 __get_free_page(GFP_KERNEL); 7450 if (!vmx_bitmap[i]) { 7451 nested_vmx_hardware_unsetup(); 7452 return -ENOMEM; 7453 } 7454 } 7455 7456 init_vmcs_shadow_fields(); 7457 } 7458 7459 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7460 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7461 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7462 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7463 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7464 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7465 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7466 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7467 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7468 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7469 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7470 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7471 7472 return 0; 7473 } 7474 7475 static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, 7476 u64 access, 7477 struct x86_exception *exception, 7478 u64 pte_access) 7479 { 7480 struct kvm_mmu *mmu = vcpu->arch.mmu; 7481 7482 BUG_ON(!mmu_is_nested(vcpu)); 7483 7484 /* 7485 * MBEC differentiates based on the effective U/S bit of 7486 * the guest page tables; not the processor CPL. 
7487 */ 7488 access &= ~PFERR_USER_MASK; 7489 if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK)) 7490 access |= PFERR_USER_MASK; 7491 7492 return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); 7493 } 7494 7495 struct kvm_x86_nested_ops vmx_nested_ops = { 7496 .leave_nested = vmx_leave_nested, 7497 .translate_nested_gpa = vmx_translate_nested_gpa, 7498 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7499 .check_events = vmx_check_nested_events, 7500 .has_events = vmx_has_nested_events, 7501 .triple_fault = nested_vmx_triple_fault, 7502 .get_state = vmx_get_nested_state, 7503 .set_state = vmx_set_nested_state, 7504 .get_nested_state_pages = vmx_get_nested_state_pages, 7505 .write_log_dirty = nested_vmx_write_pml_buffer, 7506 #ifdef CONFIG_KVM_HYPERV 7507 .enable_evmcs = nested_enable_evmcs, 7508 .get_evmcs_version = nested_get_evmcs_version, 7509 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7510 #endif 7511 }; 7512