Lines Matching +full:supervisor +full:- +full:mode +full:- +full:visible

1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
16 * Ben-Ami Yassour <benami@il.ibm.com>
48 #include <linux/user-return-notifier.h>
107 ((struct kvm_vcpu *)(ctxt)->vcpu)
110 * - enable syscall per default because its emulated by KVM
111 * - enable LME and LMA per default on 64 bit KVM
145 *(((struct kvm_x86_ops *)0)->func));
148 #include <asm/kvm-x86-ops.h>
165 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
174 * Flags to manipulate forced emulation behavior (any non-zero value will
181 int __read_mostly pi_inject_timer = -1;
461 * List of MSRs that control the existence of MSR-based features, i.e. MSRs
474 (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
566 size - useroffset, NULL);
575 vcpu->arch.apf.gfns[i] = ~0;
591 if (msrs->registered) {
592 msrs->registered = false;
597 values = &msrs->values[slot];
598 if (values->host != values->curr) {
599 wrmsrq(kvm_uret_msrs_list[slot], values->host);
600 values->curr = values->host;
625 return -1;
640 return -1;
652 msrs->values[i].host = value;
653 msrs->values[i].curr = value;
659 if (!msrs->registered) {
660 msrs->urn.on_user_return = kvm_on_user_return;
661 user_return_notifier_register(&msrs->urn);
662 msrs->registered = true;
671 value = (value & mask) | (msrs->values[slot].host & ~mask);
672 if (value == msrs->values[slot].curr)
678 msrs->values[slot].curr = value;
688 msrs->values[slot].curr = value;
695 return this_cpu_ptr(user_return_msrs)->values[slot].curr;
703 if (msrs->registered)
704 kvm_on_user_return(&msrs->urn);
758 * #DBs can be trap-like or fault-like, the caller must check other CPU
777 if (!ex->has_payload)
780 switch (ex->vector) {
783 * "Certain debug exceptions may clear bit 0-3. The
787 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
796 * Active low bits should be cleared if 1-setting in payload.
797 * Active high bits should be set if 1-setting in payload.
804 vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
805 vcpu->arch.dr6 |= ex->payload;
806 vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
814 vcpu->arch.dr6 &= ~BIT(12);
817 vcpu->arch.cr2 = ex->payload;
821 ex->has_payload = false;
822 ex->payload = 0;
830 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
832 ex->vector = vector;
833 ex->injected = false;
834 ex->pending = true;
835 ex->has_error_code = has_error_code;
836 ex->error_code = error_code;
837 ex->has_payload = has_payload;
838 ex->payload = payload;
851 * If the exception is destined for L2, morph it to a VM-Exit if L1
855 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
861 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
863 vcpu->arch.exception.pending = true;
864 vcpu->arch.exception.injected = false;
866 vcpu->arch.exception.has_error_code = has_error;
867 vcpu->arch.exception.vector = nr;
868 vcpu->arch.exception.error_code = error_code;
869 vcpu->arch.exception.has_payload = has_payload;
870 vcpu->arch.exception.payload = payload;
873 &vcpu->arch.exception);
878 prev_nr = vcpu->arch.exception.vector;
880 /* triple fault -> shutdown */
892 vcpu->arch.exception.injected = false;
893 vcpu->arch.exception.pending = false;
898 that instruction re-execution will regenerate lost
929 * On VM-Entry, an exception can be pending if and only if event
939 * re-checking is incorrect if _L1_ injected the exception, in which
944 vcpu->arch.exception.injected = true;
945 vcpu->arch.exception.has_error_code = has_error_code;
946 vcpu->arch.exception.vector = nr;
947 vcpu->arch.exception.error_code = error_code;
948 vcpu->arch.exception.has_payload = false;
949 vcpu->arch.exception.payload = 0;
977 ++vcpu->stat.pf_guest;
980 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
983 if (is_guest_mode(vcpu) && fault->async_page_fault)
985 true, fault->error_code,
986 true, fault->address);
988 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
989 fault->address);
996 WARN_ON_ONCE(fault->vector != PF_VECTOR);
998 fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
999 vcpu->arch.walk_mmu;
1005 if ((fault->error_code & PFERR_PRESENT_MASK) &&
1006 !(fault->error_code & PFERR_RSVD_MASK))
1007 kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
1010 fault_mmu->inject_page_fault(vcpu, fault);
1016 atomic_inc(&vcpu->arch.nmi_queued);
1050 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
1058 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
1063 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
1091 if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
1092 kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
1094 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
1097 vcpu->arch.pdptrs_from_userspace = false;
1122 * CR0.WP is incorporated into the MMU role, but only for non-nested,
1169 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
1180 if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
1208 if (vcpu->arch.guest_state_protected)
1213 if (vcpu->arch.xcr0 != kvm_host.xcr0)
1214 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
1217 vcpu->arch.ia32_xss != kvm_host.xss)
1218 wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
1222 vcpu->arch.pkru != vcpu->arch.host_pkru &&
1223 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1225 wrpkru(vcpu->arch.pkru);
1231 if (vcpu->arch.guest_state_protected)
1235 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1237 vcpu->arch.pkru = rdpkru();
1238 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1239 wrpkru(vcpu->arch.host_pkru);
1244 if (vcpu->arch.xcr0 != kvm_host.xcr0)
1248 vcpu->arch.ia32_xss != kvm_host.xss)
1258 return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1265 u64 old_xcr0 = vcpu->arch.xcr0;
1281 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1300 vcpu->arch.xcr0 = xcr0;
1303 vcpu->arch.cpuid_dynamic_bits_dirty = true;
1333 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1347 * - CR4.PCIDE is changed from 1 to 0
1348 * - CR4.PGE is toggled
1359 * - CR4.SMEP is changed from 0 to 1
1360 * - CR4.PAE is toggled
1405 struct kvm_mmu *mmu = vcpu->arch.mmu;
1440 if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1443 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1463 * Do not condition the GPA check on long mode, this helper is used to
1465 * the current vCPU mode is accurate.
1476 vcpu->arch.cr3 = cr3;
1485 * and it's impossible to use a non-zero PCID when PCID is disabled,
1502 vcpu->arch.cr8 = cr8;
1512 return vcpu->arch.cr8;
1520 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1522 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1530 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1531 dr7 = vcpu->arch.guest_debug_dr7;
1533 dr7 = vcpu->arch.dr7;
1535 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1537 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1555 size_t size = ARRAY_SIZE(vcpu->arch.db);
1559 vcpu->arch.db[array_index_nospec(dr, size)] = val;
1560 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1561 vcpu->arch.eff_db[dr] = val;
1567 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1573 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1584 size_t size = ARRAY_SIZE(vcpu->arch.db);
1588 return vcpu->arch.db[array_index_nospec(dr, size)];
1591 return vcpu->arch.dr6;
1594 return vcpu->arch.dr7;
1618 * 10 - MISC_PACKAGE_CTRLS
1619 * 11 - ENERGY_FILTERING_CTL
1620 * 12 - DOITM
1621 * 18 - FB_CLEAR_CTRL
1622 * 21 - XAPIC_DISABLE_STATUS
1623 * 23 - OVERCLOCKING_STATUS
1756 u64 old_efer = vcpu->arch.efer;
1757 u64 efer = msr_info->data;
1763 if (!msr_info->host_initiated) {
1768 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1773 efer |= vcpu->arch.efer & EFER_LMA;
1801 struct kvm *kvm = vcpu->kvm;
1810 idx = srcu_read_lock(&kvm->srcu);
1812 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1818 allowed = msr_filter->default_allow;
1819 ranges = msr_filter->ranges;
1821 for (i = 0; i < msr_filter->count; i++) {
1828 allowed = test_bit(index - start, bitmap);
1834 srcu_read_unlock(&kvm->srcu, idx);
1843 * Returns 0 on success, non-0 otherwise.
1864 * non-canonical address is written on Intel but not on
1865 * AMD (which ignores the top 32-bits, because it does
1866 * not implement 64-bit SYSENTER).
1868 * 64-bit code should hence be able to write a non-canonical
1870 * vmentry does not fail on Intel after writing a non-canonical
1872 * invokes 64-bit SYSENTER.
1891 * clear the bits. This ensures cross-vendor migration will
1933 /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
1962 * Returns 0 on success, non-0 otherwise.
2056 if (!vcpu->run->msr.error) {
2057 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
2058 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
2064 return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2075 return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
2086 if (!vcpu->run->msr.error)
2087 kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
2088 vcpu->run->msr.data);
2113 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2116 vcpu->run->exit_reason = exit_reason;
2117 vcpu->run->msr.error = 0;
2118 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2119 vcpu->run->msr.reason = msr_reason;
2120 vcpu->run->msr.index = index;
2121 vcpu->run->msr.data = data;
2122 vcpu->arch.complete_userspace_io = completion;
2139 kvm_rax_write(vcpu, data & -1u);
2140 kvm_rdx_write(vcpu, (data >> 32) & -1u);
2157 return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
2164 vcpu->arch.cui_rdmsr_imm_reg = reg;
2237 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
2240 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
2243 enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
2268 return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
2276 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
2277 kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
2329 return -EINVAL;
2361 write_seqcount_begin(&vdata->seq);
2364 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
2365 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
2366 vdata->clock.mask = tk->tkr_mono.mask;
2367 vdata->clock.mult = tk->tkr_mono.mult;
2368 vdata->clock.shift = tk->tkr_mono.shift;
2369 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
2370 vdata->clock.offset = tk->tkr_mono.base;
2372 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
2373 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
2374 vdata->raw_clock.mask = tk->tkr_raw.mask;
2375 vdata->raw_clock.mult = tk->tkr_raw.mult;
2376 vdata->raw_clock.shift = tk->tkr_raw.shift;
2377 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
2378 vdata->raw_clock.offset = tk->tkr_raw.base;
2380 vdata->wall_time_sec = tk->xtime_sec;
2382 vdata->offs_boot = tk->offs_boot;
2384 write_seqcount_end(&vdata->seq);
2444 struct kvm_arch *ka = &vcpu->kvm->arch;
2446 if (vcpu->vcpu_id == 0 && !host_initiated) {
2447 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2450 ka->boot_vcpu_runs_old_kvmclock = old_msr;
2453 vcpu->arch.time = system_time;
2458 kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2461 kvm_gpc_deactivate(&vcpu->arch.pv_time);
2484 shift--;
2529 vcpu->arch.tsc_catchup = 1;
2530 vcpu->arch.tsc_always_catchup = 1;
2534 return -1;
2538 /* TSC scaling required - calculate ratio */
2543 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2545 return -1;
2561 return -1;
2566 &vcpu->arch.virtual_tsc_shift,
2567 &vcpu->arch.virtual_tsc_mult);
2568 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2576 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2588 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2589 vcpu->arch.virtual_tsc_mult,
2590 vcpu->arch.virtual_tsc_shift);
2591 tsc += vcpu->arch.this_tsc_write;
2596 static inline bool gtod_is_based_on_tsc(int mode)
2598 return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2605 struct kvm_arch *ka = &vcpu->kvm->arch;
2613 bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
2614 atomic_read(&vcpu->kvm->online_vcpus)) &&
2615 gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2623 if ((ka->use_master_clock && new_generation) ||
2624 (ka->use_master_clock != use_master_clock))
2627 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2628 atomic_read(&vcpu->kvm->online_vcpus),
2629 ka->use_master_clock, gtod->clock.vclock_mode);
2636 * The most significant 64-N bits (mult) of ratio represent the
2639 * point number (mult + frac * 2^(-N)).
2662 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2664 return target_tsc - tsc;
2669 return vcpu->arch.l1_tsc_offset +
2670 kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2701 if (vcpu->arch.guest_tsc_protected)
2704 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2705 vcpu->arch.l1_tsc_offset,
2708 vcpu->arch.l1_tsc_offset = l1_offset;
2716 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2721 vcpu->arch.tsc_offset = l1_offset;
2728 vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2732 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2736 vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2746 * TSC is marked unstable when we're running on Hyper-V,
2763 struct kvm *kvm = vcpu->kvm;
2765 lockdep_assert_held(&kvm->arch.tsc_write_lock);
2767 if (vcpu->arch.guest_tsc_protected)
2771 vcpu->kvm->arch.user_set_tsc = true;
2777 kvm->arch.last_tsc_nsec = ns;
2778 kvm->arch.last_tsc_write = tsc;
2779 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2780 kvm->arch.last_tsc_offset = offset;
2782 vcpu->arch.last_guest_tsc = tsc;
2794 * These values are tracked in kvm->arch.cur_xxx variables.
2796 kvm->arch.cur_tsc_generation++;
2797 kvm->arch.cur_tsc_nsec = ns;
2798 kvm->arch.cur_tsc_write = tsc;
2799 kvm->arch.cur_tsc_offset = offset;
2800 kvm->arch.nr_vcpus_matched_tsc = 0;
2801 } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2802 kvm->arch.nr_vcpus_matched_tsc++;
2806 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2807 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2808 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2816 struct kvm *kvm = vcpu->kvm;
2822 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2825 elapsed = ns - kvm->arch.last_tsc_nsec;
2827 if (vcpu->arch.virtual_tsc_khz) {
2834 } else if (kvm->arch.user_set_tsc) {
2835 u64 tsc_exp = kvm->arch.last_tsc_write +
2837 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2839 * Here lies UAPI baggage: when a user-initiated TSC write has
2850 * come from the kernel's default vCPU creation. Make the 1-second
2866 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2868 offset = kvm->arch.cur_tsc_offset;
2878 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2884 u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2890 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2893 vcpu->arch.l1_tsc_scaling_ratio);
2920 int *mode)
2925 switch (clock->vclock_mode) {
2930 *mode = VDSO_CLOCKMODE_HVCLOCK;
2931 v = (tsc_pg_val - clock->cycle_last) &
2932 clock->mask;
2935 *mode = VDSO_CLOCKMODE_NONE;
2939 *mode = VDSO_CLOCKMODE_TSC;
2941 v = (*tsc_timestamp - clock->cycle_last) &
2942 clock->mask;
2945 *mode = VDSO_CLOCKMODE_NONE;
2948 if (*mode == VDSO_CLOCKMODE_NONE)
2951 return v * clock->mult;
2956 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
2962 int mode;
2966 seq = read_seqcount_begin(&gtod->seq);
2967 ns = gtod->raw_clock.base_cycles;
2968 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2969 ns >>= gtod->raw_clock.shift;
2970 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2971 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2974 return mode;
2985 int mode;
2989 seq = read_seqcount_begin(&gtod->seq);
2990 ns = gtod->clock.base_cycles;
2991 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2992 ns >>= gtod->clock.shift;
2993 ns += ktime_to_ns(gtod->clock.offset);
2994 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2997 return mode;
3004 int mode;
3008 seq = read_seqcount_begin(&gtod->seq);
3009 ts->tv_sec = gtod->wall_time_sec;
3010 ns = gtod->clock.base_cycles;
3011 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
3012 ns >>= gtod->clock.shift;
3013 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3015 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
3016 ts->tv_nsec = ns;
3018 return mode;
3072 * Each numbered line represents an event visible to both
3084 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
3085 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
3086 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
3088 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
3090 * - ret0 < ret1
3091 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
3093 * - 0 < N - M => M < N
3112 struct kvm_arch *ka = &kvm->arch;
3116 lockdep_assert_held(&kvm->arch.tsc_write_lock);
3117 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
3118 atomic_read(&kvm->online_vcpus));
3125 &ka->master_kernel_ns,
3126 &ka->master_cycle_now);
3128 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
3129 && !ka->backwards_tsc_observed
3130 && !ka->boot_vcpu_runs_old_kvmclock;
3132 if (ka->use_master_clock)
3136 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
3148 raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
3149 write_seqcount_begin(&kvm->arch.pvclock_sc);
3162 struct kvm_arch *ka = &kvm->arch;
3166 write_seqcount_end(&ka->pvclock_sc);
3167 raw_spin_unlock_irq(&ka->tsc_write_lock);
3186 * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
3200 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
3203 struct kvm_arch *ka = &kvm->arch;
3209 data->flags = 0;
3210 if (ka->use_master_clock &&
3215 if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3216 data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3217 data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3220 data->host_tsc = rdtsc();
3222 data->flags |= KVM_CLOCK_TSC_STABLE;
3223 hv_clock.tsc_timestamp = ka->master_cycle_now;
3224 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3228 data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3230 data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3238 struct kvm_arch *ka = &kvm->arch;
3242 seq = read_seqcount_begin(&ka->pvclock_sc);
3244 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3266 read_lock_irqsave(&gpc->lock, flags);
3268 read_unlock_irqrestore(&gpc->lock, flags);
3273 read_lock_irqsave(&gpc->lock, flags);
3276 guest_hv_clock = (void *)(gpc->khva + offset);
3285 guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
3289 hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3295 guest_hv_clock->version = ++hv_clock.version;
3298 read_unlock_irqrestore(&gpc->lock, flags);
3300 trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
3308 struct kvm_vcpu_arch *vcpu = &v->arch;
3309 struct kvm_arch *ka = &v->kvm->arch;
3322 seq = read_seqcount_begin(&ka->pvclock_sc);
3323 use_master_clock = ka->use_master_clock;
3325 host_tsc = ka->master_cycle_now;
3326 kernel_ns = ka->master_kernel_ns;
3328 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3355 if (vcpu->tsc_catchup) {
3358 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3369 v->arch.l1_tsc_scaling_ratio);
3373 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3375 &vcpu->pvclock_tsc_shift,
3376 &vcpu->pvclock_tsc_mul);
3377 vcpu->hw_tsc_khz = tgt_tsc_khz;
3380 hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
3381 hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
3383 hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3384 vcpu->last_guest_tsc = tsc_timestamp;
3391 if (vcpu->pv_time.active) {
3397 if (vcpu->pvclock_set_guest_stopped_request) {
3399 vcpu->pvclock_set_guest_stopped_request = false;
3401 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
3406 kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
3417 if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
3420 if (vcpu->xen.vcpu_info_cache.active)
3421 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
3423 if (vcpu->xen.vcpu_time_info_cache.active)
3424 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
3452 struct kvm_arch *ka = &kvm->arch;
3458 seq = read_seqcount_begin(&ka->pvclock_sc);
3461 if (!ka->use_master_clock)
3483 hv_clock.tsc_timestamp = ka->master_cycle_now;
3484 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3486 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3492 * since 1970-01-01.
3498 return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
3502 return ktime_get_real_ns() - get_kvmclock_ns(kvm);
3507 * vcpu->cpu migration, should not allow system_timestamp from
3513 * We need to rate-limit these requests though, as they can
3516 * by the delay we use to rate-limit the updates.
3538 struct kvm *kvm = v->kvm;
3541 schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3554 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3555 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3576 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3583 u64 mcg_cap = vcpu->arch.mcg_cap;
3585 u32 msr = msr_info->index;
3586 u64 data = msr_info->data;
3591 vcpu->arch.mcg_status = data;
3595 (data || !msr_info->host_initiated))
3599 vcpu->arch.mcg_ctl = data;
3601 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3602 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3606 if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3611 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3612 last_msr + 1 - MSR_IA32_MC0_CTL2);
3613 vcpu->arch.mci_ctl2_banks[offset] = data;
3615 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3616 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3628 * single-bit ECC data errors.
3636 * AMD-based CPUs allow non-zero values, but if and only if
3639 if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3643 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3644 last_msr + 1 - MSR_IA32_MC0_CTL);
3645 vcpu->arch.mce_banks[offset] = data;
3657 return (vcpu->arch.apf.msr_en_val & mask) == mask;
3679 vcpu->arch.apf.msr_en_val = data;
3687 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3691 vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
3692 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3701 /* Bits 8-63 are reserved */
3708 vcpu->arch.apf.msr_int_val = data;
3710 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3717 kvm_gpc_deactivate(&vcpu->arch.pv_time);
3718 vcpu->arch.time = 0;
3723 ++vcpu->stat.tlb_flush;
3732 ++vcpu->stat.tlb_flush;
3748 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3757 ++vcpu->stat.tlb_flush;
3765 * prior before nested VM-Enter/VM-Exit.
3779 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3782 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3786 if (kvm_xen_msr_enabled(vcpu->kvm)) {
3791 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3794 if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3797 slots = kvm_memslots(vcpu->kvm);
3799 if (unlikely(slots->generation != ghc->generation ||
3800 gpa != ghc->gpa ||
3801 kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3803 BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3805 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3806 kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3810 st = (struct kvm_steal_time __user *)ghc->hva;
3817 int err = -EFAULT;
3828 "+m" (st->preempted));
3834 vcpu->arch.st.preempted = 0;
3836 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3847 unsafe_put_user(0, &st->preempted, out);
3848 vcpu->arch.st.preempted = 0;
3851 unsafe_get_user(version, &st->version, out);
3856 unsafe_put_user(version, &st->version, out);
3860 unsafe_get_user(steal, &st->steal, out);
3861 steal += current->sched_info.run_delay -
3862 vcpu->arch.st.last_steal;
3863 vcpu->arch.st.last_steal = current->sched_info.run_delay;
3864 unsafe_put_user(steal, &st->steal, out);
3867 unsafe_put_user(version, &st->version, out);
3872 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3879 * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
3883 * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
3884 * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
3904 * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
3915 KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
3916 KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
3920 rdmsrq(msr_info->index, msr_info->data);
3922 wrmsrq(msr_info->index, msr_info->data);
3938 u32 msr = msr_info->index;
3939 u64 data = msr_info->data;
3942 * Do not allow host-initiated writes to trigger the Xen hypercall
3946 if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
3947 !msr_info->host_initiated)
3962 if (msr_info->host_initiated)
3963 vcpu->arch.microcode_version = data;
3966 if (!msr_info->host_initiated ||
3969 vcpu->arch.arch_capabilities = data;
3972 if (!msr_info->host_initiated ||
3984 if (vcpu->arch.perf_capabilities == data)
3987 vcpu->arch.perf_capabilities = data;
3993 if (!msr_info->host_initiated) {
4021 if (!msr_info->host_initiated &&
4048 vcpu->arch.msr_hwcr = data;
4060 vcpu->arch.pat = data;
4066 return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
4074 if (!msr_info->host_initiated) {
4075 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
4082 vcpu->arch.ia32_tsc_adjust_msr = data;
4086 u64 old_val = vcpu->arch.ia32_misc_enable_msr;
4088 if (!msr_info->host_initiated) {
4098 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
4102 vcpu->arch.ia32_misc_enable_msr = data;
4103 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4105 vcpu->arch.ia32_misc_enable_msr = data;
4110 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4112 vcpu->arch.smbase = data;
4115 vcpu->arch.msr_ia32_power_ctl = data;
4118 if (msr_info->host_initiated) {
4120 } else if (!vcpu->arch.guest_tsc_protected) {
4121 u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
4123 vcpu->arch.ia32_tsc_adjust_msr += adj;
4130 if (data & ~vcpu->arch.guest_supported_xss)
4132 if (vcpu->arch.ia32_xss == data)
4134 vcpu->arch.ia32_xss = data;
4135 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4138 if (!msr_info->host_initiated)
4140 vcpu->arch.smi_count = data;
4146 vcpu->kvm->arch.wall_clock = data;
4147 kvm_write_wall_clock(vcpu->kvm, data, 0);
4153 vcpu->kvm->arch.wall_clock = data;
4154 kvm_write_wall_clock(vcpu->kvm, data, 0);
4160 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
4166 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
4186 vcpu->arch.apf.pageready_pending = false;
4200 vcpu->arch.st.msr_val = data;
4221 if (data & (-1ULL << 1))
4224 vcpu->arch.msr_kvm_poll_control = data;
4229 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4230 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4247 * all pre-dating SVM, but a recommended workaround from
4265 msr_info->host_initiated);
4268 /* Drop writes to this legacy MSR -- see rdmsr
4276 vcpu->arch.osvw.length = data;
4281 vcpu->arch.osvw.status = data;
4284 if (!msr_info->host_initiated)
4286 vcpu->arch.msr_platform_info = data;
4293 vcpu->arch.msr_misc_features_enables = data;
4297 if (!msr_info->host_initiated &&
4304 fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4307 if (!msr_info->host_initiated &&
4314 vcpu->arch.guest_fpu.xfd_err = data;
4334 u64 mcg_cap = vcpu->arch.mcg_cap;
4344 data = vcpu->arch.mcg_cap;
4349 data = vcpu->arch.mcg_ctl;
4352 data = vcpu->arch.mcg_status;
4354 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4355 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
4361 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
4362 last_msr + 1 - MSR_IA32_MC0_CTL2);
4363 data = vcpu->arch.mci_ctl2_banks[offset];
4365 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4366 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
4370 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
4371 last_msr + 1 - MSR_IA32_MC0_CTL);
4372 data = vcpu->arch.mce_banks[offset];
4383 switch (msr_info->index) {
4406 * so for existing CPU-specific MSRs.
4413 msr_info->data = 0;
4419 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4421 msr_info->data = 0;
4424 msr_info->data = vcpu->arch.microcode_version;
4429 msr_info->data = vcpu->arch.arch_capabilities;
4434 msr_info->data = vcpu->arch.perf_capabilities;
4437 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4446 * return L1's TSC value to ensure backwards-compatible
4451 if (msr_info->host_initiated) {
4452 offset = vcpu->arch.l1_tsc_offset;
4453 ratio = vcpu->arch.l1_tsc_scaling_ratio;
4455 offset = vcpu->arch.tsc_offset;
4456 ratio = vcpu->arch.tsc_scaling_ratio;
4459 msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4463 msr_info->data = vcpu->arch.pat;
4468 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4470 msr_info->data = 3;
4484 msr_info->data = 1 << 24;
4487 msr_info->data = vcpu->arch.apic_base;
4490 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4492 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4495 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4498 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4501 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4503 msr_info->data = vcpu->arch.smbase;
4506 msr_info->data = vcpu->arch.smi_count;
4510 msr_info->data = 1000ULL;
4512 msr_info->data |= (((uint64_t)4ULL) << 40);
4515 msr_info->data = vcpu->arch.efer;
4521 msr_info->data = vcpu->kvm->arch.wall_clock;
4527 msr_info->data = vcpu->kvm->arch.wall_clock;
4533 msr_info->data = vcpu->arch.time;
4539 msr_info->data = vcpu->arch.time;
4545 msr_info->data = vcpu->arch.apf.msr_en_val;
4551 msr_info->data = vcpu->arch.apf.msr_int_val;
4557 msr_info->data = 0;
4563 msr_info->data = vcpu->arch.st.msr_val;
4569 msr_info->data = vcpu->arch.pv_eoi.msr_val;
4575 msr_info->data = vcpu->arch.msr_kvm_poll_control;
4582 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4583 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4584 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4585 msr_info->host_initiated);
4587 if (!msr_info->host_initiated &&
4590 msr_info->data = vcpu->arch.ia32_xss;
4594 * Provide expected ramp-up count for K7. All other
4602 msr_info->data = 0x20000000;
4616 msr_info->index, &msr_info->data,
4617 msr_info->host_initiated);
4630 msr_info->data = 0xbe702111;
4635 msr_info->data = vcpu->arch.osvw.length;
4640 msr_info->data = vcpu->arch.osvw.status;
4643 if (!msr_info->host_initiated &&
4644 !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4646 msr_info->data = vcpu->arch.msr_platform_info;
4649 msr_info->data = vcpu->arch.msr_misc_features_enables;
4652 msr_info->data = vcpu->arch.msr_hwcr;
4656 if (!msr_info->host_initiated &&
4660 msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4663 if (!msr_info->host_initiated &&
4667 msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4675 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4697 for (i = 0; i < msrs->nmsrs; ++i) {
4699 * If userspace is accessing one or more XSTATE-managed MSRs,
4732 r = -EFAULT;
4736 r = -E2BIG;
4741 entries = memdup_user(user_msrs->entries, size);
4749 if (writeback && copy_to_user(user_msrs->entries, entries, size))
4750 r = -EFAULT;
4788 r = -EFAULT;
4792 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4796 r = -EFAULT;
4811 return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
4940 * so do not report SMM to be available if real mode is
4941 * emulated via vm86 mode. Still, do not go to great lengths
4954 r = kvm->max_vcpus;
4976 r = kvm_x86_ops.nested_ops->get_state ?
4977 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4984 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
5029 if (attr->group) {
5031 return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
5032 return -ENXIO;
5035 switch (attr->attr) {
5040 return -ENXIO;
5046 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5055 return -EFAULT;
5079 r = -EFAULT;
5086 r = -E2BIG;
5089 r = -EFAULT;
5090 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
5093 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
5105 r = -EFAULT;
5109 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
5114 r = -EFAULT;
5121 r = -EFAULT;
5132 r = -EFAULT;
5139 r = -E2BIG;
5142 r = -EFAULT;
5143 if (copy_to_user(user_msr_list->indices, &msr_based_features,
5159 r = -EFAULT;
5167 r = -EFAULT;
5174 r = -EINVAL;
5183 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
5192 vcpu->arch.l1tf_flush_l1d = true;
5194 if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
5195 pmu->need_cleanup = true;
5202 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
5203 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
5204 wbinvd_on_cpu(vcpu->cpu);
5215 * is handled on the nested VM-Exit path.
5223 vcpu->arch.host_pkru = read_pkru();
5226 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
5227 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
5228 vcpu->arch.tsc_offset_adjustment = 0;
5232 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
5233 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
5234 rdtsc() - vcpu->arch.last_host_tsc;
5240 vcpu->arch.last_guest_tsc);
5242 if (!vcpu->arch.guest_tsc_protected)
5243 vcpu->arch.tsc_catchup = 1;
5251 * kvmclock on vcpu->cpu migration
5253 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
5255 if (vcpu->cpu != cpu)
5257 vcpu->cpu = cpu;
5265 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
5269 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
5272 * The vCPU can be marked preempted if and only if the VM-Exit was on
5276 * preempted if and only if the VM-Exit was due to a host interrupt.
5278 if (!vcpu->arch.at_instruction_boundary) {
5279 vcpu->stat.preemption_other++;
5283 vcpu->stat.preemption_reported++;
5284 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
5287 if (vcpu->arch.st.preempted)
5291 if (unlikely(current->mm != vcpu->kvm->mm))
5294 slots = kvm_memslots(vcpu->kvm);
5296 if (unlikely(slots->generation != ghc->generation ||
5297 gpa != ghc->gpa ||
5298 kvm_is_error_hva(ghc->hva) || !ghc->memslot))
5301 st = (struct kvm_steal_time __user *)ghc->hva;
5302 BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
5304 if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5305 vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
5307 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5314 if (vcpu->preempted) {
5316 * Assume protected guests are in-kernel. Inefficient yielding
5320 vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected ||
5327 idx = srcu_read_lock(&vcpu->kvm->srcu);
5328 if (kvm_xen_msr_enabled(vcpu->kvm))
5332 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5336 vcpu->arch.last_host_tsc = rdtsc();
5342 if (vcpu->arch.apic->guest_apic_protected)
5343 return -EINVAL;
5355 if (vcpu->arch.apic->guest_apic_protected)
5356 return -EINVAL;
5389 * instruction boundary and with no events half-injected.
5400 if (irq->irq >= KVM_NR_INTERRUPTS)
5401 return -EINVAL;
5403 if (!irqchip_in_kernel(vcpu->kvm)) {
5404 kvm_queue_interrupt(vcpu, irq->irq, false);
5410 * With in-kernel LAPIC, we only use this to inject EXTINT, so
5411 * fail for in-kernel 8259.
5413 if (pic_in_kernel(vcpu->kvm))
5414 return -ENXIO;
5416 if (vcpu->arch.pending_external_vector != -1)
5417 return -EEXIST;
5419 vcpu->arch.pending_external_vector = irq->irq;
5434 if (tac->flags)
5435 return -EINVAL;
5436 vcpu->arch.tpr_access_reporting = !!tac->enabled;
5446 r = -EINVAL;
5452 vcpu->arch.mcg_cap = mcg_cap;
5455 vcpu->arch.mcg_ctl = ~(u64)0;
5458 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
5460 vcpu->arch.mci_ctl2_banks[bank] = 0;
5473 * - none of the bits for Machine Check Exceptions are set
5474 * - both the VAL (valid) and UC (uncorrectable) bits are set
5475 * MCI_STATUS_PCC - Processor Context Corrupted
5476 * MCI_STATUS_S - Signaled as a Machine Check Exception
5477 * MCI_STATUS_AR - Software recoverable Action Required
5481 return !mce->mcg_status &&
5482 !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5483 (mce->status & MCI_STATUS_VAL) &&
5484 (mce->status & MCI_STATUS_UC);
5489 u64 mcg_cap = vcpu->arch.mcg_cap;
5491 banks[1] = mce->status;
5492 banks[2] = mce->addr;
5493 banks[3] = mce->misc;
5494 vcpu->arch.mcg_status = mce->mcg_status;
5497 !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5501 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5509 u64 mcg_cap = vcpu->arch.mcg_cap;
5511 u64 *banks = vcpu->arch.mce_banks;
5513 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5514 return -EINVAL;
5516 banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5525 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5526 vcpu->arch.mcg_ctl != ~(u64)0)
5532 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5534 if (mce->status & MCI_STATUS_UC) {
5535 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5541 mce->status |= MCI_STATUS_OVER;
5542 banks[2] = mce->addr;
5543 banks[3] = mce->misc;
5544 vcpu->arch.mcg_status = mce->mcg_status;
5545 banks[1] = mce->status;
5550 mce->status |= MCI_STATUS_OVER;
5551 banks[2] = mce->addr;
5552 banks[3] = mce->misc;
5553 banks[1] = mce->status;
5574 * non-exiting _injected_ exception, and a pending exiting exception.
5575 * In that case, ignore the VM-Exiting exception as it's an extension
5578 if (vcpu->arch.exception_vmexit.pending &&
5579 !vcpu->arch.exception.pending &&
5580 !vcpu->arch.exception.injected)
5581 ex = &vcpu->arch.exception_vmexit;
5583 ex = &vcpu->arch.exception;
5586 * In guest mode, payload delivery should be deferred if the exception
5588 * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
5593 if (!vcpu->kvm->arch.exception_payload_enabled &&
5594 ex->pending && ex->has_payload)
5605 if (!kvm_exception_is_soft(ex->vector)) {
5606 events->exception.injected = ex->injected;
5607 events->exception.pending = ex->pending;
5613 if (!vcpu->kvm->arch.exception_payload_enabled)
5614 events->exception.injected |= ex->pending;
5616 events->exception.nr = ex->vector;
5617 events->exception.has_error_code = ex->has_error_code;
5618 events->exception.error_code = ex->error_code;
5619 events->exception_has_payload = ex->has_payload;
5620 events->exception_payload = ex->payload;
5622 events->interrupt.injected =
5623 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5624 events->interrupt.nr = vcpu->arch.interrupt.nr;
5625 events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
5627 events->nmi.injected = vcpu->arch.nmi_injected;
5628 events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
5629 events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
5631 /* events->sipi_vector is never valid when reporting to user space */
5634 events->smi.smm = is_smm(vcpu);
5635 events->smi.pending = vcpu->arch.smi_pending;
5636 events->smi.smm_inside_nmi =
5637 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5639 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5641 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5644 if (vcpu->kvm->arch.exception_payload_enabled)
5645 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5646 if (vcpu->kvm->arch.triple_fault_event) {
5647 events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5648 events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5655 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5661 return -EINVAL;
5663 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5664 if (!vcpu->kvm->arch.exception_payload_enabled)
5665 return -EINVAL;
5666 if (events->exception.pending)
5667 events->exception.injected = 0;
5669 events->exception_has_payload = 0;
5671 events->exception.pending = 0;
5672 events->exception_has_payload = 0;
5675 if ((events->exception.injected || events->exception.pending) &&
5676 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5677 return -EINVAL;
5683 * morph the exception to a VM-Exit if appropriate. Do this only for
5684 * pending exceptions, already-injected exceptions are not subject to
5687 * pending exception, which in turn may cause a spurious VM-Exit.
5689 vcpu->arch.exception_from_userspace = events->exception.pending;
5691 vcpu->arch.exception_vmexit.pending = false;
5693 vcpu->arch.exception.injected = events->exception.injected;
5694 vcpu->arch.exception.pending = events->exception.pending;
5695 vcpu->arch.exception.vector = events->exception.nr;
5696 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5697 vcpu->arch.exception.error_code = events->exception.error_code;
5698 vcpu->arch.exception.has_payload = events->exception_has_payload;
5699 vcpu->arch.exception.payload = events->exception_payload;
5701 vcpu->arch.interrupt.injected = events->interrupt.injected;
5702 vcpu->arch.interrupt.nr = events->interrupt.nr;
5703 vcpu->arch.interrupt.soft = events->interrupt.soft;
5704 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5706 events->interrupt.shadow);
5708 vcpu->arch.nmi_injected = events->nmi.injected;
5709 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
5710 vcpu->arch.nmi_pending = 0;
5711 atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5712 if (events->nmi.pending)
5715 kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
5717 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5719 vcpu->arch.apic->sipi_vector = events->sipi_vector;
5721 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5723 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5725 kvm_smm_changed(vcpu, events->smi.smm);
5728 vcpu->arch.smi_pending = events->smi.pending;
5730 if (events->smi.smm) {
5731 if (events->smi.smm_inside_nmi)
5732 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5734 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5738 if (events->smi.smm || events->smi.pending ||
5739 events->smi.smm_inside_nmi)
5740 return -EINVAL;
5744 if (events->smi.latched_init)
5745 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5747 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5751 if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5752 if (!vcpu->kvm->arch.triple_fault_event)
5753 return -EINVAL;
5754 if (events->triple_fault.pending)
5770 if (vcpu->kvm->arch.has_protected_state &&
5771 vcpu->arch.guest_state_protected)
5772 return -EINVAL;
5776 BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
5777 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5778 dbgregs->db[i] = vcpu->arch.db[i];
5780 dbgregs->dr6 = vcpu->arch.dr6;
5781 dbgregs->dr7 = vcpu->arch.dr7;
5790 if (vcpu->kvm->arch.has_protected_state &&
5791 vcpu->arch.guest_state_protected)
5792 return -EINVAL;
5794 if (dbgregs->flags)
5795 return -EINVAL;
5797 if (!kvm_dr6_valid(dbgregs->dr6))
5798 return -EINVAL;
5799 if (!kvm_dr7_valid(dbgregs->dr7))
5800 return -EINVAL;
5802 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5803 vcpu->arch.db[i] = dbgregs->db[i];
5806 vcpu->arch.dr6 = dbgregs->dr6;
5807 vcpu->arch.dr7 = dbgregs->dr7;
5829 u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
5832 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5833 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5835 fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
5836 supported_xcr0, vcpu->arch.pkru);
5843 return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
5844 sizeof(guest_xsave->region));
5850 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5851 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5853 return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5854 guest_xsave->region,
5856 &vcpu->arch.pkru);
5862 if (vcpu->kvm->arch.has_protected_state &&
5863 vcpu->arch.guest_state_protected)
5864 return -EINVAL;
5867 guest_xcrs->nr_xcrs = 0;
5871 guest_xcrs->nr_xcrs = 1;
5872 guest_xcrs->flags = 0;
5873 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5874 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5883 if (vcpu->kvm->arch.has_protected_state &&
5884 vcpu->arch.guest_state_protected)
5885 return -EINVAL;
5888 return -EINVAL;
5890 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5891 return -EINVAL;
5893 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5895 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5897 guest_xcrs->xcrs[i].value);
5901 r = -EINVAL;
5913 if (!vcpu->arch.pv_time.active)
5914 return -EINVAL;
5915 vcpu->arch.pvclock_set_guest_stopped_request = true;
5925 switch (attr->attr) {
5930 r = -ENXIO;
5939 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5942 switch (attr->attr) {
5944 r = -EFAULT;
5945 if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5950 r = -ENXIO;
5959 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5960 struct kvm *kvm = vcpu->kvm;
5963 switch (attr->attr) {
5969 r = -EFAULT;
5973 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5975 matched = (vcpu->arch.virtual_tsc_khz &&
5976 kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5977 kvm->arch.last_tsc_offset == offset);
5979 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5983 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5989 r = -ENXIO;
6003 return -EFAULT;
6006 return -ENXIO;
6026 if (cap->flags)
6027 return -EINVAL;
6029 switch (cap->cap) {
6032 if (cap->args[0])
6033 return -EINVAL;
6037 if (!irqchip_in_kernel(vcpu->kvm))
6038 return -EINVAL;
6039 return kvm_hv_activate_synic(vcpu, cap->cap ==
6047 if (!kvm_x86_ops.nested_ops->enable_evmcs)
6048 return -ENOTTY;
6049 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
6051 user_ptr = (void __user *)(uintptr_t)cap->args[0];
6054 r = -EFAULT;
6060 return -ENOTTY;
6065 return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
6069 vcpu->arch.pv_cpuid.enforce = cap->args[0];
6072 return -EINVAL;
6088 switch (reg->index) {
6091 * FIXME: If host-initiated accesses are ever exempted from
6097 return -EINVAL;
6099 reg->type = KVM_X86_REG_TYPE_MSR;
6100 reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
6103 return -EINVAL;
6113 return -EINVAL;
6116 return -EFAULT;
6126 return -EFAULT;
6129 return -EINVAL;
6144 return -EFAULT;
6147 return -EINVAL;
6150 if (reg->rsvd1 || reg->rsvd2)
6151 return -EINVAL;
6153 if (reg->type == KVM_X86_REG_TYPE_KVM) {
6159 if (reg->type != KVM_X86_REG_TYPE_MSR)
6160 return -EINVAL;
6163 return -EINVAL;
6165 guard(srcu)(&vcpu->kvm->srcu);
6167 load_fpu = is_xstate_managed_msr(vcpu, reg->index);
6173 r = kvm_get_one_msr(vcpu, reg->index, user_val);
6175 r = kvm_set_one_msr(vcpu, reg->index, user_val);
6188 if (get_user(user_nr_regs, &user_list->n))
6189 return -EFAULT;
6191 if (put_user(nr_regs, &user_list->n))
6192 return -EFAULT;
6195 return -E2BIG;
6198 put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
6199 return -EFAULT;
6207 struct kvm_vcpu *vcpu = filp->private_data;
6223 r = -EINVAL;
6228 r = -ENOMEM;
6234 r = -EFAULT;
6241 r = -EINVAL;
6256 r = -EFAULT;
6274 r = -EFAULT;
6277 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
6284 r = -EFAULT;
6288 cpuid_arg->entries);
6295 r = -EFAULT;
6299 cpuid_arg->entries);
6302 r = -EFAULT;
6309 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6311 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6315 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6317 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6330 r = -EFAULT;
6336 r = -EFAULT;
6346 r = -EINVAL;
6349 r = -EFAULT;
6352 idx = srcu_read_lock(&vcpu->kvm->srcu);
6354 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6360 r = -EFAULT;
6369 r = -EFAULT;
6380 r = -EFAULT;
6389 r = -EFAULT;
6405 r = -EFAULT;
6415 r = -EFAULT;
6424 r = -EINVAL;
6425 if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
6429 r = -ENOMEM;
6437 r = -EFAULT;
6444 int size = vcpu->arch.guest_fpu.uabi_size;
6457 int size = vcpu->arch.guest_fpu.uabi_size;
6460 r = -ENOMEM;
6468 r = -EFAULT;
6478 r = -ENOMEM;
6486 r = -EFAULT;
6506 r = -EINVAL;
6508 if (vcpu->arch.guest_tsc_protected)
6526 r = vcpu->arch.virtual_tsc_khz;
6536 r = -EFAULT;
6546 r = -EINVAL;
6547 if (!kvm_x86_ops.nested_ops->get_state)
6550 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
6551 r = -EFAULT;
6552 if (get_user(user_data_size, &user_kvm_nested_state->size))
6555 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
6561 if (put_user(r, &user_kvm_nested_state->size))
6562 r = -EFAULT;
6564 r = -E2BIG;
6576 r = -EINVAL;
6577 if (!kvm_x86_ops.nested_ops->set_state)
6580 r = -EFAULT;
6584 r = -EINVAL;
6599 idx = srcu_read_lock(&vcpu->kvm->srcu);
6600 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
6601 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6613 r = -EFAULT;
6618 r = -EFAULT;
6624 r = -EFAULT;
6632 r = -EINVAL;
6633 if (vcpu->kvm->arch.has_protected_state &&
6634 vcpu->arch.guest_state_protected)
6638 r = -ENOMEM;
6642 r = -EFAULT;
6649 r = -EINVAL;
6650 if (vcpu->kvm->arch.has_protected_state &&
6651 vcpu->arch.guest_state_protected)
6669 r = -ENOTTY;
6675 r = -EINVAL;
6693 if (addr > (unsigned int)(-3 * PAGE_SIZE))
6694 return -EINVAL;
6709 return -EINVAL;
6711 mutex_lock(&kvm->slots_lock);
6714 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6716 mutex_unlock(&kvm->slots_lock);
6726 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6727 * VM-Exit.
6732 if (!kvm->arch.cpu_dirty_log_size)
6744 if (cap->flags)
6745 return -EINVAL;
6747 switch (cap->cap) {
6749 r = -EINVAL;
6750 if (cap->args[0] & ~kvm_caps.supported_quirks)
6754 kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
6758 mutex_lock(&kvm->lock);
6759 r = -EINVAL;
6760 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6762 r = -EEXIST;
6765 if (kvm->created_vcpus)
6769 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6770 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6774 mutex_unlock(&kvm->lock);
6778 r = -EINVAL;
6779 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6782 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6783 kvm->arch.x2apic_format = true;
6784 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6785 kvm->arch.x2apic_broadcast_quirk_disabled = true;
6790 r = -EINVAL;
6791 if (cap->args[0] & ~kvm_get_allowed_disable_exits())
6794 mutex_lock(&kvm->lock);
6795 if (kvm->created_vcpus)
6798 #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6803 (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
6807 kvm_disable_exits(kvm, cap->args[0]);
6810 mutex_unlock(&kvm->lock);
6813 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6817 kvm->arch.exception_payload_enabled = cap->args[0];
6821 kvm->arch.triple_fault_event = cap->args[0];
6825 r = -EINVAL;
6826 if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6828 kvm->arch.user_space_msr_mask = cap->args[0];
6832 r = -EINVAL;
6833 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6836 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6837 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6841 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6842 kvm->arch.bus_lock_detection_enabled = true;
6849 r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6856 kvm->arch.sgx_provisioning_allowed = true;
6858 r = -EINVAL;
6863 r = -EINVAL;
6867 r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
6870 r = -EINVAL;
6874 r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
6877 if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6878 r = -EINVAL;
6881 kvm->arch.hypercall_exit_enabled = cap->args[0];
6885 r = -EINVAL;
6886 if (cap->args[0] & ~1)
6888 kvm->arch.exit_on_emulation_error = cap->args[0];
6892 r = -EINVAL;
6893 if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6896 mutex_lock(&kvm->lock);
6897 if (!kvm->created_vcpus) {
6898 kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6901 mutex_unlock(&kvm->lock);
6904 r = -EINVAL;
6905 if (cap->args[0] > KVM_MAX_VCPU_IDS)
6908 mutex_lock(&kvm->lock);
6909 if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
6911 } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6913 } else if (!kvm->arch.max_vcpu_ids) {
6914 kvm->arch.max_vcpu_ids = cap->args[0];
6917 mutex_unlock(&kvm->lock);
6920 r = -EINVAL;
6921 if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6925 if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6927 mutex_lock(&kvm->lock);
6928 if (!kvm->created_vcpus) {
6929 kvm->arch.notify_window = cap->args[0] >> 32;
6930 kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6933 mutex_unlock(&kvm->lock);
6936 r = -EINVAL;
6950 r = -EPERM;
6954 if (cap->args[0])
6957 mutex_lock(&kvm->lock);
6958 if (!kvm->created_vcpus) {
6959 kvm->arch.disable_nx_huge_pages = true;
6962 mutex_unlock(&kvm->lock);
6965 u64 bus_cycle_ns = cap->args[0];
6972 r = -EINVAL;
6978 mutex_lock(&kvm->lock);
6980 r = -ENXIO;
6981 else if (kvm->created_vcpus)
6982 r = -EINVAL;
6984 kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
6985 mutex_unlock(&kvm->lock);
6989 r = -EINVAL;
7003 msr_filter->default_allow = default_allow;
7014 for (i = 0; i < msr_filter->count; i++)
7015 kfree(msr_filter->ranges[i].bitmap);
7026 if (!user_range->nmsrs)
7029 if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
7030 return -EINVAL;
7032 if (!user_range->flags)
7033 return -EINVAL;
7035 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
7037 return -EINVAL;
7039 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
7043 msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
7044 .flags = user_range->flags,
7045 .base = user_range->base,
7046 .nmsrs = user_range->nmsrs,
7050 msr_filter->count++;
7063 if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
7064 return -EINVAL;
7066 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
7067 empty &= !filter->ranges[i].nmsrs;
7069 default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
7071 return -EINVAL;
7075 return -ENOMEM;
7077 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
7078 r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
7085 mutex_lock(&kvm->lock);
7086 old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
7087 mutex_is_locked(&kvm->lock));
7088 mutex_unlock(&kvm->lock);
7089 synchronize_srcu(&kvm->srcu);
7122 struct kvm *kvm = filp->private_data;
7123 long r = -ENOTTY;
7134 return -EFAULT;
7142 .flags = cr->flags,
7143 .nmsrs = cr->nmsrs,
7144 .base = cr->base,
7145 .bitmap = (__u8 *)(ulong)cr->bitmap,
7192 return -EFAULT;
7199 struct kvm_arch *ka = &kvm->arch;
7204 return -EFAULT;
7211 return -EINVAL;
7231 data.clock += now_real_ns - data.realtime;
7234 if (ka->use_master_clock)
7235 now_raw_ns = ka->master_kernel_ns;
7238 ka->kvmclock_offset = data.clock - now_raw_ns;
7245 struct kvm *kvm = filp->private_data;
7247 int r = -ENOTTY;
7251 * This union makes it completely explicit to gcc-3.x
7269 mutex_lock(&kvm->lock);
7270 r = -EINVAL;
7271 if (kvm->created_vcpus)
7273 r = -EFAULT;
7278 mutex_unlock(&kvm->lock);
7286 mutex_lock(&kvm->lock);
7288 r = -EEXIST;
7293 * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
7295 * emulate level-triggered interrupts.
7297 r = -ENOTTY;
7298 if (kvm->arch.has_protected_eoi)
7301 r = -EINVAL;
7302 if (kvm->created_vcpus)
7321 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
7323 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
7326 mutex_unlock(&kvm->lock);
7333 r = -EFAULT;
7338 mutex_lock(&kvm->lock);
7339 r = -EEXIST;
7340 if (kvm->arch.vpit)
7342 r = -ENOENT;
7345 r = -ENOMEM;
7346 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7347 if (kvm->arch.vpit)
7350 mutex_unlock(&kvm->lock);
7362 r = -ENXIO;
7368 r = -EFAULT;
7386 r = -ENXIO;
7395 r = -EFAULT;
7398 r = -ENXIO;
7399 if (!kvm->arch.vpit)
7404 r = -EFAULT;
7411 r = -EFAULT;
7414 mutex_lock(&kvm->lock);
7415 r = -ENXIO;
7416 if (!kvm->arch.vpit)
7420 mutex_unlock(&kvm->lock);
7424 r = -ENXIO;
7425 if (!kvm->arch.vpit)
7430 r = -EFAULT;
7437 r = -EFAULT;
7440 mutex_lock(&kvm->lock);
7441 r = -ENXIO;
7442 if (!kvm->arch.vpit)
7446 mutex_unlock(&kvm->lock);
7451 r = -EFAULT;
7454 r = -ENXIO;
7455 if (!kvm->arch.vpit)
7463 mutex_lock(&kvm->lock);
7464 if (kvm->created_vcpus)
7465 r = -EBUSY;
7467 (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
7468 r = -EINVAL;
7470 kvm->arch.bsp_vcpu_id = arg;
7471 mutex_unlock(&kvm->lock);
7476 r = -EFAULT;
7485 r = -EFAULT;
7490 r = -EFAULT;
7496 r = -EFAULT;
7505 r = -EFAULT;
7521 r = -EINVAL;
7531 mutex_lock(&kvm->lock);
7532 if (!kvm->created_vcpus) {
7533 WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
7536 mutex_unlock(&kvm->lock);
7540 r = READ_ONCE(kvm->arch.default_tsc_khz);
7544 r = -ENOTTY;
7553 r = -EFAULT;
7557 r = -ENOTTY;
7567 r = -EFAULT;
7571 r = -ENOTTY;
7582 r = -EFAULT;
7597 return -EFAULT;
7603 r = -ENOTTY;
7663 (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7668 MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
7669 if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7674 MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
7675 if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7680 MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
7681 if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7769 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7774 len -= n;
7789 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7796 len -= n;
7818 struct kvm_mmu *mmu = vcpu->arch.mmu;
7823 /* NPT walks are always user-walks */
7825 t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7833 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7836 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7843 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7847 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7855 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7857 return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7864 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7869 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7870 unsigned offset = addr & (PAGE_SIZE-1);
7871 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7883 bytes -= toread;
7897 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7903 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7908 offset = addr & (PAGE_SIZE-1);
7910 bytes = (unsigned)PAGE_SIZE - offset;
7956 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7961 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7962 unsigned offset = addr & (PAGE_SIZE-1);
7963 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7974 bytes -= towrite;
8002 vcpu->arch.l1tf_flush_l1d = true;
8062 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8072 !permission_fault(vcpu, vcpu->arch.walk_mmu,
8073 vcpu->arch.mmio_access, 0, access))) {
8074 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
8075 (gva & (PAGE_SIZE - 1));
8080 *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
8083 return -1;
8114 if (vcpu->mmio_read_completed) {
8116 vcpu->mmio_fragments[0].gpa, val);
8117 vcpu->mmio_read_completed = 0;
8152 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
8154 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
8180 bool write = ops->write;
8182 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8191 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
8192 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
8193 gpa = ctxt->gpa_val;
8201 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
8207 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
8212 bytes -= handled;
8215 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
8216 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
8217 frag->gpa = gpa;
8218 frag->data = val;
8219 frag->len = bytes;
8233 if (ops->read_write_prepare &&
8234 ops->read_write_prepare(vcpu, val, bytes))
8237 vcpu->mmio_nr_fragments = 0;
8240 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
8243 now = -addr & ~PAGE_MASK;
8250 if (ctxt->mode != X86EMUL_MODE_PROT64)
8253 bytes -= now;
8261 if (!vcpu->mmio_nr_fragments)
8264 gpa = vcpu->mmio_fragments[0].gpa;
8266 vcpu->mmio_needed = 1;
8267 vcpu->mmio_cur_fragment = 0;
8269 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
8270 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
8271 vcpu->run->exit_reason = KVM_EXIT_MMIO;
8272 vcpu->run->mmio.phys_addr = gpa;
8274 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
8314 if (bytes > 8 || (bytes & (bytes - 1)))
8328 page_line_mask = ~(cache_line_size() - 1);
8332 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
8390 WARN_ON_ONCE(vcpu->arch.pio.count);
8406 memset(data, 0, size * (count - i));
8415 vcpu->arch.pio.port = port;
8416 vcpu->arch.pio.in = in;
8417 vcpu->arch.pio.count = count;
8418 vcpu->arch.pio.size = size;
8421 memset(vcpu->arch.pio_data, 0, size * count);
8423 memcpy(vcpu->arch.pio_data, data, size * count);
8425 vcpu->run->exit_reason = KVM_EXIT_IO;
8426 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
8427 vcpu->run->io.size = size;
8428 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
8429 vcpu->run->io.count = count;
8430 vcpu->run->io.port = port;
8446 int size = vcpu->arch.pio.size;
8447 unsigned int count = vcpu->arch.pio.count;
8448 memcpy(val, vcpu->arch.pio_data, size * count);
8449 trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
8450 vcpu->arch.pio.count = 0;
8458 if (vcpu->arch.pio.count) {
8506 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
8507 wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
8509 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8543 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
8556 value = vcpu->arch.cr2;
8585 vcpu->arch.cr2 = val;
8598 res = -1;
8659 desc->type = var.type;
8660 desc->s = var.s;
8661 desc->dpl = var.dpl;
8662 desc->p = var.present;
8663 desc->avl = var.avl;
8664 desc->l = var.l;
8665 desc->d = var.db;
8666 desc->g = var.g;
8684 if (desc->g)
8686 var.type = desc->type;
8687 var.dpl = desc->dpl;
8688 var.db = desc->d;
8689 var.s = desc->s;
8690 var.l = desc->l;
8691 var.g = desc->g;
8692 var.avl = desc->avl;
8693 var.present = desc->p;
8751 * Treat emulator accesses to the current shadow stack pointer as host-
8754 * so the index is fully KVM-controlled.
8775 emul_to_vcpu(ctxt)->arch.halt_request = 1;
8783 &ctxt->exception);
8853 struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8855 if (!kvm->vm_bugged)
8944 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8946 if (ctxt->exception.vector == PF_VECTOR)
8947 kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8948 else if (ctxt->exception.error_code_valid)
8949 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8950 ctxt->exception.error_code);
8952 kvm_queue_exception(vcpu, ctxt->exception.vector);
8965 ctxt->vcpu = vcpu;
8966 ctxt->ops = &emulate_ops;
8967 vcpu->arch.emulate_ctxt = ctxt;
8974 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8979 ctxt->gpa_available = false;
8980 ctxt->eflags = kvm_get_rflags(vcpu);
8981 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8983 ctxt->eip = kvm_rip_read(vcpu);
8984 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
8985 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
8989 ctxt->interruptibility = 0;
8990 ctxt->have_exception = false;
8991 ctxt->exception.vector = -1;
8992 ctxt->perm_ok = false;
8995 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9000 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9005 ctxt->op_bytes = 2;
9006 ctxt->ad_bytes = 2;
9007 ctxt->_eip = ctxt->eip + inc_eip;
9013 ctxt->eip = ctxt->_eip;
9014 kvm_rip_write(vcpu, ctxt->eip);
9015 kvm_set_rflags(vcpu, ctxt->eflags);
9023 struct kvm_run *run = vcpu->run;
9036 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9037 run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
9049 run->emulation_failure.flags = 0;
9052 BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
9053 sizeof(run->emulation_failure.insn_bytes) != 16));
9055 run->emulation_failure.flags |=
9057 run->emulation_failure.insn_size = insn_size;
9058 memset(run->emulation_failure.insn_bytes, 0x90,
9059 sizeof(run->emulation_failure.insn_bytes));
9060 memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
9063 memcpy(&run->internal.data[info_start], info, sizeof(info));
9064 memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
9067 run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
9072 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9074 prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
9075 ctxt->fetch.end - ctxt->fetch.data);
9094 struct kvm_run *run = vcpu->run;
9101 run->internal.data[ndata++] = info2;
9102 run->internal.data[ndata++] = reason;
9103 run->internal.data[ndata++] = info1;
9104 run->internal.data[ndata++] = gpa;
9105 run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
9107 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9108 run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
9109 run->internal.ndata = ndata;
9115 struct kvm *kvm = vcpu->kvm;
9117 ++vcpu->stat.insn_emulation_fail;
9125 if (kvm->arch.exit_on_emulation_error ||
9154 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
9163 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
9164 * guest to let the CPU re-execute the instruction in the hope that the
9199 struct kvm_run *kvm_run = vcpu->run;
9201 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
9202 kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
9203 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
9204 kvm_run->debug.arch.exception = DB_VECTOR;
9205 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9275 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
9276 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
9277 struct kvm_run *kvm_run = vcpu->run;
9280 vcpu->arch.guest_debug_dr7,
9281 vcpu->arch.eff_db);
9284 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
9285 kvm_run->debug.arch.pc = eip;
9286 kvm_run->debug.arch.exception = DB_VECTOR;
9287 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9293 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
9297 vcpu->arch.dr7,
9298 vcpu->arch.db);
9312 switch (ctxt->opcode_len) {
9314 switch (ctxt->b) {
9331 switch (ctxt->b) {
9344 * (and wrong) when emulating on an intercepted fault-like exception[*], as
9354 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9362 ++vcpu->stat.insn_emulation;
9372 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9398 vcpu->arch.l1tf_flush_l1d = true;
9405 * are fault-like and are higher priority than any faults on
9423 if (ctxt->have_exception &&
9426 * #UD should result in just EMULATION_FAILED, and trap-like
9429 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
9430 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
9448 * injecting single-step #DBs.
9451 if (ctxt->mode != X86EMUL_MODE_PROT64)
9452 ctxt->eip = (u32)ctxt->_eip;
9454 ctxt->eip = ctxt->_eip;
9461 kvm_rip_write(vcpu, ctxt->eip);
9462 if (ctxt->eflags & X86_EFLAGS_RF)
9463 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9468 * If emulation was caused by a write-protection #PF on a non-page_table
9480 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
9481 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9488 ctxt->exception.address = cr2_or_gpa;
9491 if (vcpu->arch.mmu->root_role.direct) {
9492 ctxt->gpa_available = true;
9493 ctxt->gpa_val = cr2_or_gpa;
9497 ctxt->exception.address = 0;
9502 * L2, unless KVM is re-emulating a previously decoded instruction,
9520 if (ctxt->have_exception) {
9521 WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
9522 vcpu->mmio_needed = false;
9525 } else if (vcpu->arch.pio.count) {
9526 if (!vcpu->arch.pio.in) {
9527 /* FIXME: return into emulator if single-stepping. */
9528 vcpu->arch.pio.count = 0;
9531 vcpu->arch.complete_userspace_io = complete_emulated_pio;
9534 } else if (vcpu->mmio_needed) {
9535 ++vcpu->stat.mmio_exits;
9537 if (!vcpu->mmio_is_write)
9540 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9541 } else if (vcpu->arch.complete_userspace_io) {
9552 toggle_interruptibility(vcpu, ctxt->interruptibility);
9553 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9556 * Note, EXCPT_DB is assumed to be fault-like as the emulator
9558 * of which are fault-like.
9560 if (!ctxt->have_exception ||
9561 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
9563 if (ctxt->is_branch)
9565 kvm_rip_write(vcpu, ctxt->eip);
9566 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
9569 __kvm_set_rflags(vcpu, ctxt->eflags);
9578 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
9581 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
9601 vcpu->arch.pio.count = 0;
9607 vcpu->arch.pio.count = 0;
9609 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
9629 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9630 vcpu->arch.complete_userspace_io =
9634 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9635 vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9645 BUG_ON(vcpu->arch.pio.count != 1);
9647 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
9648 vcpu->arch.pio.count = 0;
9653 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
9676 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9677 vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9708 khz = freq->new;
9729 /* TSC frequency always matches when on Hyper-V */
9797 if (vcpu->cpu != cpu)
9800 if (vcpu->cpu != raw_smp_processor_id())
9806 if (freq->old < freq->new && send_ipi) {
9829 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9831 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9834 for_each_cpu(cpu, freq->policy->cpus)
9862 if (policy->cpuinfo.max_freq)
9863 max_tsc_khz = policy->cpuinfo.max_freq;
9921 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9934 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9944 #include <asm/kvm-x86-ops.h>
9947 kvm_pmu_ops_update(ops->pmu_ops);
9965 return -EIO;
9984 return -EEXIST;
9994 return -EOPNOTSUPP;
9999 return -EOPNOTSUPP;
10012 return -EIO;
10018 * Linux doesn't yet support supervisor shadow stacks (SSS), so
10024 return -EIO;
10032 return -ENOMEM;
10038 r = -ENOMEM;
10065 kvm_init_pmu_capability(ops->pmu_ops);
10070 r = ops->hardware_setup();
10092 if (pi_inject_timer == -1)
10101 kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
10195 return -KVM_EOPNOTSUPP;
10198 * When tsc is in permanent catchup mode guests won't be able to use
10201 if (vcpu->arch.tsc_always_catchup)
10202 return -KVM_EOPNOTSUPP;
10205 return -KVM_EOPNOTSUPP;
10214 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
10216 ret = -KVM_EFAULT;
10225 * @apicid - apicid of vcpu to be kicked.
10245 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
10251 ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
10279 set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
10281 init_rwsem(&kvm->arch.apicv_update_lock);
10289 vcpu->stat.directed_yield_attempted++;
10295 map = rcu_dereference(vcpu->kvm->arch.apic_map);
10297 if (likely(map) && dest_id <= map->max_apic_id) {
10298 dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
10299 if (map->phys_map[dest_id])
10300 target = map->phys_map[dest_id]->vcpu;
10305 if (!target || !READ_ONCE(target->ready))
10315 vcpu->stat.directed_yield_successful++;
10323 u64 ret = vcpu->run->hypercall.ret;
10342 ++vcpu->stat.hypercalls;
10355 ret = -KVM_EPERM;
10359 ret = -KVM_ENOSYS;
10369 kvm_pv_kick_cpu_op(vcpu->kvm, a1);
10382 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10394 ret = -KVM_ENOSYS;
10395 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
10400 ret = -KVM_EINVAL;
10404 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
10405 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
10407 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
10408 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
10410 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
10412 vcpu->run->hypercall.ret = 0;
10413 vcpu->run->hypercall.args[0] = gpa;
10414 vcpu->run->hypercall.args[1] = npages;
10415 vcpu->run->hypercall.args[2] = attrs;
10416 vcpu->run->hypercall.flags = 0;
10418 vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
10420 WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
10421 vcpu->arch.complete_userspace_io = complete_hypercall;
10425 ret = -KVM_ENOSYS;
10430 vcpu->run->hypercall.ret = ret;
10437 if (kvm_xen_hypercall_enabled(vcpu->kvm))
10458 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10459 ctxt->exception.error_code_valid = false;
10460 ctxt->exception.vector = UD_VECTOR;
10461 ctxt->have_exception = true;
10468 &ctxt->exception);
10473 return vcpu->run->request_interrupt_window &&
10474 likely(!pic_in_kernel(vcpu->kvm));
10477 /* Called within kvm->srcu read side. */
10480 struct kvm_run *kvm_run = vcpu->run;
10482 kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
10483 kvm_run->cr8 = kvm_get_cr8(vcpu);
10484 kvm_run->apic_base = vcpu->arch.apic_base;
10486 kvm_run->ready_for_interrupt_injection =
10487 pic_in_kernel(vcpu->kvm) ||
10491 kvm_run->flags |= KVM_RUN_X86_SMM;
10493 kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
10506 if (vcpu->arch.apic->apicv_active)
10509 if (!vcpu->arch.apic->vapic_addr)
10512 max_irr = -1;
10514 if (max_irr != -1)
10526 kvm_x86_ops.nested_ops->triple_fault(vcpu);
10530 return kvm_x86_ops.nested_ops->check_events(vcpu);
10536 * Suppress the error code if the vCPU is in Real Mode, as Real Mode
10539 * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
10540 * report an error code despite the CPU being in Real Mode.
10542 vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
10544 trace_kvm_inj_exception(vcpu->arch.exception.vector,
10545 vcpu->arch.exception.has_error_code,
10546 vcpu->arch.exception.error_code,
10547 vcpu->arch.exception.injected);
10557 * injected as part of a previous VM-Enter, but weren't successfully delivered
10558 * and need to be re-injected.
10563 * also be able to re-inject NMIs and IRQs in the middle of an instruction.
10564 * I.e. for exceptions and re-injected events, NOT invoking this on instruction
10569 * instruction boundaries for asynchronous events. However, because VM-Exits
10575 * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
10598 * Process nested events first, as nested VM-Exit supersedes event
10599 * re-injection. If there's an event queued for re-injection, it will
10600 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
10608 * Re-inject exceptions and events *especially* if immediate entry+exit
10612 * Don't re-inject an NMI or interrupt if there is a pending exception.
10621 * as the exception "occurred" before the exit to userspace. Trap-like
10623 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
10626 * Thus a pending fault-like exception means the fault occurred on the
10630 if (vcpu->arch.exception.injected)
10634 else if (vcpu->arch.nmi_injected)
10636 else if (vcpu->arch.interrupt.injected)
10640 * Exceptions that morph to VM-Exits are handled above, and pending
10641 * exceptions on top of injected exceptions that do not VM-Exit should
10644 WARN_ON_ONCE(vcpu->arch.exception.injected &&
10645 vcpu->arch.exception.pending);
10649 * nested VM-Enter or event re-injection so that a different pending
10652 * Otherwise, continue processing events even if VM-Exit occurred. The
10653 * VM-Exit will have cleared exceptions that were meant for L2, but
10660 * A pending exception VM-Exit should either result in nested VM-Exit
10661 * or force an immediate re-entry and exit to/from L2, and exception
10662 * VM-Exits cannot be injected (flag should _never_ be set).
10664 WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10665 vcpu->arch.exception_vmexit.pending);
10669 * to re-inject a previous event. See above comments on re-injecting
10674 if (vcpu->arch.exception.pending) {
10676 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10677 * value pushed on the stack. Trap-like exception and all #DBs
10678 * leave RF as-is (KVM follows Intel's behavior in this regard;
10683 * fault-like. They do _not_ set RF, a la code breakpoints.
10685 if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
10689 if (vcpu->arch.exception.vector == DB_VECTOR) {
10690 kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
10691 if (vcpu->arch.dr7 & DR7_GD) {
10692 vcpu->arch.dr7 &= ~DR7_GD;
10699 vcpu->arch.exception.pending = false;
10700 vcpu->arch.exception.injected = true;
10706 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10711 * due to architectural conditions (e.g. IF=0) a window-open exit
10712 * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
10718 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10721 if (vcpu->arch.smi_pending) {
10723 -EBUSY;
10727 vcpu->arch.smi_pending = false;
10728 ++vcpu->arch.smi_count;
10736 if (vcpu->arch.nmi_pending) {
10738 -EBUSY;
10742 --vcpu->arch.nmi_pending;
10743 vcpu->arch.nmi_injected = true;
10748 if (vcpu->arch.nmi_pending)
10754 -EBUSY;
10760 if (!WARN_ON_ONCE(irq == -1)) {
10771 kvm_x86_ops.nested_ops->has_events &&
10772 kvm_x86_ops.nested_ops->has_events(vcpu, true))
10777 * is done emulating and should only propagate the to-be-injected event
10779 * infinite loop as KVM will bail from VM-Enter to inject the pending
10785 * Mode events (see kvm_inject_realmode_interrupt()).
10787 WARN_ON_ONCE(vcpu->arch.exception.pending ||
10788 vcpu->arch.exception_vmexit.pending);
10792 if (r == -EBUSY) {
10813 if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10820 * tracked in vcpu->arch.nmi_pending.
10823 limit--;
10825 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10826 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10828 if (vcpu->arch.nmi_pending &&
10830 vcpu->arch.nmi_pending--;
10832 if (vcpu->arch.nmi_pending)
10839 return vcpu->arch.nmi_pending +
10856 struct kvm_lapic *apic = vcpu->arch.apic;
10862 down_read(&vcpu->kvm->arch.apicv_update_lock);
10869 if (apic->apicv_active == activate)
10872 apic->apicv_active = activate;
10882 if (!apic->apicv_active)
10887 up_read(&vcpu->kvm->arch.apicv_update_lock);
10904 * despite being in x2APIC mode. For simplicity, inhibiting the APIC
10907 if (apic_x2apic_mode(vcpu->arch.apic) &&
10919 lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10924 old = new = kvm->arch.apicv_inhibit_reasons;
10942 kvm->arch.apicv_inhibit_reasons = new;
10945 int idx = srcu_read_lock(&kvm->srcu);
10948 srcu_read_unlock(&kvm->srcu, idx);
10951 kvm->arch.apicv_inhibit_reasons = new;
10961 down_write(&kvm->arch.apicv_update_lock);
10963 up_write(&kvm->arch.apicv_update_lock);
10972 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10973 vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
10977 if (irqchip_split(vcpu->kvm))
10978 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10980 else if (ioapic_in_kernel(vcpu->kvm))
10981 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10985 vcpu->arch.load_eoi_exitmap_pending = true;
10992 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
11000 vcpu->arch.ioapic_handled_vectors,
11001 to_hv_synic(vcpu)->vec_bitmap, 256);
11007 vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
11024 * Called within kvm->srcu read side.
11042 r = -EIO;
11052 if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
11062 kvm_update_masterclock(vcpu->kvm);
11086 * Fall back to a "full" guest flush if Hyper-V's precise
11087 * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
11098 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
11104 kvm_x86_ops.nested_ops->triple_fault(vcpu);
11107 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
11108 vcpu->mmio_needed = 0;
11115 vcpu->arch.apf.halted = true;
11132 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
11133 if (test_bit(vcpu->arch.pending_ioapic_eoi,
11134 vcpu->arch.ioapic_handled_vectors)) {
11135 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
11136 vcpu->run->eoi.vector =
11137 vcpu->arch.pending_ioapic_eoi;
11150 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11151 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
11152 vcpu->run->system_event.ndata = 0;
11157 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11158 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
11159 vcpu->run->system_event.ndata = 0;
11166 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
11167 vcpu->run->hyperv = hv_vcpu->exit;
11174 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
11175 * depend on the guest clock being up-to-date
11193 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
11202 ++vcpu->stat.req_event;
11208 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
11243 /* Store vcpu->apicv_active before vcpu->mode. */
11244 smp_store_release(&vcpu->mode, IN_GUEST_MODE);
11249 * 1) We should set ->mode before checking ->requests. Please see
11252 * 2) For APICv, we should set ->mode before checking PID.ON. This
11256 * 3) This also orders the write to mode from any reads to the page
11273 vcpu->mode = OUTSIDE_GUEST_MODE;
11292 if (vcpu->arch.guest_fpu.xfd_err)
11293 wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
11295 if (unlikely(vcpu->arch.switch_db_regs &&
11296 !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
11298 set_debugreg(vcpu->arch.eff_db[0], 0);
11299 set_debugreg(vcpu->arch.eff_db[1], 1);
11300 set_debugreg(vcpu->arch.eff_db[2], 2);
11301 set_debugreg(vcpu->arch.eff_db[3], 3);
11303 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
11312 * vendor code if any host-owned bits were changed, e.g. so that the
11316 if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
11317 !vcpu->arch.guest_state_protected)
11319 vcpu->arch.host_debugctl = debug_ctl;
11327 * per-VM state, and responding vCPUs must wait for the update
11347 /* Note, VM-Exits that go down the "slow" path are accounted below. */
11348 ++vcpu->stat.exits;
11357 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
11358 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
11359 WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
11375 vcpu->arch.last_vmentry_cpu = vcpu->cpu;
11376 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
11378 vcpu->mode = OUTSIDE_GUEST_MODE;
11383 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
11386 if (vcpu->arch.xfd_no_write_intercept)
11391 if (vcpu->arch.guest_fpu.xfd_err)
11405 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
11412 ++vcpu->stat.exits;
11440 !vcpu->arch.guest_state_protected)) {
11445 if (unlikely(vcpu->arch.tsc_always_catchup))
11448 if (vcpu->arch.apic_attention)
11461 if (unlikely(vcpu->arch.apic_attention))
11469 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
11470 !vcpu->arch.apf.halted);
11475 if (!list_empty_careful(&vcpu->async_pf.done))
11486 (vcpu->arch.nmi_pending &&
11492 (vcpu->arch.smi_pending &&
11510 kvm_x86_ops.nested_ops->has_events &&
11511 kvm_x86_ops.nested_ops->has_events(vcpu, false))
11523 return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
11527 /* Called within kvm->srcu read side. */
11534 * Switch to the software timer before halt-polling/blocking as
11536 * hypervisor timer runs only when the CPU is in guest mode.
11537 * Switch before halt-polling so that KVM recognizes an expired
11545 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11566 * state field (AMD does not have a similar field and a VM-Exit always
11572 WARN_ON_ONCE(r == -EBUSY);
11579 switch(vcpu->arch.mp_state) {
11585 vcpu->arch.apf.halted = false;
11596 /* Called within kvm->srcu read side. */
11601 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
11610 vcpu->arch.at_instruction_boundary = false;
11630 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
11631 ++vcpu->stat.request_irq_exits;
11651 * local APIC is in-kernel, the run loop will detect the non-runnable
11656 ++vcpu->stat.halt_exits;
11658 if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
11663 vcpu->run->exit_reason = reason;
11678 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
11714 return vcpu->arch.preempted_in_kernel;
11719 if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
11739 BUG_ON(!vcpu->arch.pio.count);
11764 struct kvm_run *run = vcpu->run;
11768 BUG_ON(!vcpu->mmio_needed);
11771 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
11772 len = min(8u, frag->len);
11773 if (!vcpu->mmio_is_write)
11774 memcpy(frag->data, run->mmio.data, len);
11776 if (frag->len <= 8) {
11779 vcpu->mmio_cur_fragment++;
11782 frag->data += len;
11783 frag->gpa += len;
11784 frag->len -= len;
11787 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
11788 vcpu->mmio_needed = 0;
11790 /* FIXME: return into emulator if single-stepping. */
11791 if (vcpu->mmio_is_write)
11793 vcpu->mmio_read_completed = 1;
11797 run->exit_reason = KVM_EXIT_MMIO;
11798 run->mmio.phys_addr = frag->gpa;
11799 if (vcpu->mmio_is_write)
11800 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
11801 run->mmio.len = min(8u, frag->len);
11802 run->mmio.is_write = vcpu->mmio_is_write;
11803 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
11810 /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
11811 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
11818 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
11819 ++vcpu->stat.fpu_reload;
11826 * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
11831 if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
11832 return -EINVAL;
11838 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
11840 return -EINVAL;
11847 struct kvm_queued_exception *ex = &vcpu->arch.exception;
11848 struct kvm_run *kvm_run = vcpu->run;
11852 r = kvm_mmu_post_init_vm(vcpu->kvm);
11858 kvm_run->flags = 0;
11862 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
11863 if (!vcpu->wants_to_run) {
11864 r = -EINTR;
11886 r = -EAGAIN;
11888 r = -EINTR;
11889 kvm_run->exit_reason = KVM_EXIT_INTR;
11890 ++vcpu->stat.signal_exits;
11895 sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
11896 if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
11897 (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
11898 r = -EINVAL;
11902 if (kvm_run->kvm_dirty_regs) {
11908 /* re-sync apic's tpr */
11910 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11911 r = -EINVAL;
11918 * a pending VM-Exit if L1 wants to intercept the exception.
11920 if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11921 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11922 ex->error_code)) {
11923 kvm_queue_exception_vmexit(vcpu, ex->vector,
11924 ex->has_error_code, ex->error_code,
11925 ex->has_payload, ex->payload);
11926 ex->injected = false;
11927 ex->pending = false;
11929 vcpu->arch.exception_from_userspace = false;
11931 if (unlikely(vcpu->arch.complete_userspace_io)) {
11932 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11933 vcpu->arch.complete_userspace_io = NULL;
11938 WARN_ON_ONCE(vcpu->arch.pio.count);
11939 WARN_ON_ONCE(vcpu->mmio_needed);
11942 if (!vcpu->wants_to_run) {
11943 r = -EINTR;
11955 if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
11967 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
11975 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
11976 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11978 regs->rax = kvm_rax_read(vcpu);
11979 regs->rbx = kvm_rbx_read(vcpu);
11980 regs->rcx = kvm_rcx_read(vcpu);
11981 regs->rdx = kvm_rdx_read(vcpu);
11982 regs->rsi = kvm_rsi_read(vcpu);
11983 regs->rdi = kvm_rdi_read(vcpu);
11984 regs->rsp = kvm_rsp_read(vcpu);
11985 regs->rbp = kvm_rbp_read(vcpu);
11987 regs->r8 = kvm_r8_read(vcpu);
11988 regs->r9 = kvm_r9_read(vcpu);
11989 regs->r10 = kvm_r10_read(vcpu);
11990 regs->r11 = kvm_r11_read(vcpu);
11991 regs->r12 = kvm_r12_read(vcpu);
11992 regs->r13 = kvm_r13_read(vcpu);
11993 regs->r14 = kvm_r14_read(vcpu);
11994 regs->r15 = kvm_r15_read(vcpu);
11997 regs->rip = kvm_rip_read(vcpu);
11998 regs->rflags = kvm_get_rflags(vcpu);
12003 if (vcpu->kvm->arch.has_protected_state &&
12004 vcpu->arch.guest_state_protected)
12005 return -EINVAL;
12015 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
12016 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12018 kvm_rax_write(vcpu, regs->rax);
12019 kvm_rbx_write(vcpu, regs->rbx);
12020 kvm_rcx_write(vcpu, regs->rcx);
12021 kvm_rdx_write(vcpu, regs->rdx);
12022 kvm_rsi_write(vcpu, regs->rsi);
12023 kvm_rdi_write(vcpu, regs->rdi);
12024 kvm_rsp_write(vcpu, regs->rsp);
12025 kvm_rbp_write(vcpu, regs->rbp);
12027 kvm_r8_write(vcpu, regs->r8);
12028 kvm_r9_write(vcpu, regs->r9);
12029 kvm_r10_write(vcpu, regs->r10);
12030 kvm_r11_write(vcpu, regs->r11);
12031 kvm_r12_write(vcpu, regs->r12);
12032 kvm_r13_write(vcpu, regs->r13);
12033 kvm_r14_write(vcpu, regs->r14);
12034 kvm_r15_write(vcpu, regs->r15);
12037 kvm_rip_write(vcpu, regs->rip);
12038 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
12040 vcpu->arch.exception.pending = false;
12041 vcpu->arch.exception_vmexit.pending = false;
12048 if (vcpu->kvm->arch.has_protected_state &&
12049 vcpu->arch.guest_state_protected)
12050 return -EINVAL;
12062 if (vcpu->arch.guest_state_protected)
12065 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12066 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12067 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12068 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12069 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12070 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12072 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12073 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12076 sregs->idt.limit = dt.size;
12077 sregs->idt.base = dt.address;
12079 sregs->gdt.limit = dt.size;
12080 sregs->gdt.base = dt.address;
12082 sregs->cr2 = vcpu->arch.cr2;
12083 sregs->cr3 = kvm_read_cr3(vcpu);
12086 sregs->cr0 = kvm_read_cr0(vcpu);
12087 sregs->cr4 = kvm_read_cr4(vcpu);
12088 sregs->cr8 = kvm_get_cr8(vcpu);
12089 sregs->efer = vcpu->arch.efer;
12090 sregs->apic_base = vcpu->arch.apic_base;
12097 if (vcpu->arch.guest_state_protected)
12100 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
12101 set_bit(vcpu->arch.interrupt.nr,
12102 (unsigned long *)sregs->interrupt_bitmap);
12111 if (vcpu->arch.guest_state_protected)
12116 sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
12117 sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
12124 if (vcpu->kvm->arch.has_protected_state &&
12125 vcpu->arch.guest_state_protected)
12126 return -EINVAL;
12150 if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
12151 vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
12152 vcpu->arch.pv.pv_unhalted)
12153 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
12155 mp_state->mp_state = vcpu->arch.mp_state;
12169 int ret = -EINVAL;
12173 switch (mp_state->mp_state) {
12192 * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
12195 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
12196 mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
12197 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
12200 kvm_set_mp_state(vcpu, mp_state->mp_state);
12212 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
12219 * Check both User and Supervisor on task switches as inter-
12243 if (ret || vcpu->mmio_needed)
12246 kvm_rip_write(vcpu, ctxt->eip);
12247 kvm_set_rflags(vcpu, ctxt->eflags);
12251 vcpu->mmio_needed = false;
12252 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
12253 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
12254 vcpu->run->internal.ndata = 0;
12261 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
12264 * 64-bit mode (though maybe in a 32-bit code segment).
12267 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
12269 if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
12273 * Not in 64-bit mode: EFER.LMA is clear and the code
12274 * segment cannot be 64-bit.
12276 if (sregs->efer & EFER_LMA || sregs->cs.l)
12280 return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
12281 kvm_is_valid_cr0(vcpu, sregs->cr0);
12291 return -EINVAL;
12293 if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
12294 return -EINVAL;
12296 if (vcpu->arch.guest_state_protected)
12299 dt.size = sregs->idt.limit;
12300 dt.address = sregs->idt.base;
12302 dt.size = sregs->gdt.limit;
12303 dt.address = sregs->gdt.base;
12306 vcpu->arch.cr2 = sregs->cr2;
12307 *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
12308 vcpu->arch.cr3 = sregs->cr3;
12310 kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
12312 kvm_set_cr8(vcpu, sregs->cr8);
12314 *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
12315 kvm_x86_call(set_efer)(vcpu, sregs->efer);
12317 *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
12318 kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
12320 *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
12321 kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
12324 idx = srcu_read_lock(&vcpu->kvm->srcu);
12329 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12332 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12333 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12334 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12335 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12336 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12337 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12339 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12340 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12346 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
12369 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
12382 bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
12383 bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
12384 !(sregs2->efer & EFER_LMA);
12387 if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
12388 return -EINVAL;
12390 if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
12391 return -EINVAL;
12400 kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
12404 vcpu->arch.pdptrs_from_userspace = true;
12418 if (vcpu->kvm->arch.has_protected_state &&
12419 vcpu->arch.guest_state_protected)
12420 return -EINVAL;
12437 down_write(&kvm->arch.apicv_update_lock);
12440 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
12446 up_write(&kvm->arch.apicv_update_lock);
12455 if (vcpu->arch.guest_state_protected)
12456 return -EINVAL;
12460 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
12461 r = -EBUSY;
12464 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
12476 vcpu->guest_debug = dbg->control;
12477 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
12478 vcpu->guest_debug = 0;
12480 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
12482 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
12483 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
12486 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
12490 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
12491 vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
12501 kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
12516 unsigned long vaddr = tr->linear_address;
12522 idx = srcu_read_lock(&vcpu->kvm->srcu);
12524 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12525 tr->physical_address = gpa;
12526 tr->valid = gpa != INVALID_GPA;
12527 tr->writeable = 1;
12528 tr->usermode = 0;
12538 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12539 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12543 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12544 memcpy(fpu->fpr, fxsave->st_space, 128);
12545 fpu->fcw = fxsave->cwd;
12546 fpu->fsw = fxsave->swd;
12547 fpu->ftwx = fxsave->twd;
12548 fpu->last_opcode = fxsave->fop;
12549 fpu->last_ip = fxsave->rip;
12550 fpu->last_dp = fxsave->rdp;
12551 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
12561 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12562 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12566 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12568 memcpy(fxsave->st_space, fpu->fpr, 128);
12569 fxsave->cwd = fpu->fcw;
12570 fxsave->swd = fpu->fsw;
12571 fxsave->twd = fpu->ftwx;
12572 fxsave->fop = fpu->last_opcode;
12573 fxsave->rip = fpu->last_ip;
12574 fxsave->rdp = fpu->last_dp;
12575 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
12585 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
12586 __get_regs(vcpu, &vcpu->run->s.regs.regs);
12588 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
12589 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12591 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
12593 vcpu, &vcpu->run->s.regs.events);
12598 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
12599 __set_regs(vcpu, &vcpu->run->s.regs.regs);
12600 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
12603 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
12604 struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
12607 return -EINVAL;
12609 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
12612 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
12613 struct kvm_vcpu_events events = vcpu->run->s.regs.events;
12616 return -EINVAL;
12618 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
12626 if (kvm_check_tsc_unstable() && kvm->created_vcpus)
12630 if (!kvm->arch.max_vcpu_ids)
12631 kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
12633 if (id >= kvm->arch.max_vcpu_ids)
12634 return -EINVAL;
12644 vcpu->arch.last_vmentry_cpu = -1;
12645 vcpu->arch.regs_avail = ~0;
12646 vcpu->arch.regs_dirty = ~0;
12648 kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
12650 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12663 r = -ENOMEM;
12668 vcpu->arch.pio_data = page_address(page);
12670 vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
12672 vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
12674 if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
12676 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
12678 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12685 if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
12692 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
12693 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
12694 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
12695 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
12699 vcpu->arch.pending_external_vector = -1;
12700 vcpu->arch.preempted_in_kernel = false;
12703 vcpu->arch.hv_root_tdp = INVALID_PAGE;
12713 kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
12720 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12722 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12724 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12726 kfree(vcpu->arch.mce_banks);
12727 kfree(vcpu->arch.mci_ctl2_banks);
12728 free_page((unsigned long)vcpu->arch.pio_data);
12738 struct kvm *kvm = vcpu->kvm;
12740 if (mutex_lock_killable(&vcpu->mutex))
12747 vcpu->arch.msr_kvm_poll_control = 1;
12749 mutex_unlock(&vcpu->mutex);
12751 if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
12752 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
12770 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12771 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12772 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12777 kfree(vcpu->arch.mce_banks);
12778 kfree(vcpu->arch.mci_ctl2_banks);
12780 idx = srcu_read_lock(&vcpu->kvm->srcu);
12782 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12783 free_page((unsigned long)vcpu->arch.pio_data);
12784 kvfree(vcpu->arch.cpuid_entries);
12789 struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
12830 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
12840 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
12851 vcpu->arch.hflags = 0;
12853 vcpu->arch.smi_pending = 0;
12854 vcpu->arch.smi_count = 0;
12855 atomic_set(&vcpu->arch.nmi_queued, 0);
12856 vcpu->arch.nmi_pending = 0;
12857 vcpu->arch.nmi_injected = false;
12861 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
12863 vcpu->arch.dr6 = DR6_ACTIVE_LOW;
12864 vcpu->arch.dr7 = DR7_FIXED_1;
12867 vcpu->arch.cr2 = 0;
12870 vcpu->arch.apf.msr_en_val = 0;
12871 vcpu->arch.apf.msr_int_val = 0;
12872 vcpu->arch.st.msr_val = 0;
12878 vcpu->arch.apf.halted = false;
12883 vcpu->arch.smbase = 0x30000;
12885 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
12887 vcpu->arch.msr_misc_features_enables = 0;
12888 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
12896 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
12907 kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12914 vcpu->arch.cr3 = 0;
12936 * which PCIDs have to be flushed. However, CR0.WP and the paging-related
13006 if (!stable && vcpu->cpu == smp_processor_id())
13008 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
13010 if (vcpu->arch.last_host_tsc > max_tsc)
13011 max_tsc = vcpu->arch.last_host_tsc;
13041 * N.B. - this code below runs only on platforms with reliable TSC,
13051 * catchup mode. This will catch up all VCPUs to real time, but cannot
13055 u64 delta_cyc = max_tsc - local_tsc;
13057 kvm->arch.backwards_tsc_observed = true;
13059 vcpu->arch.tsc_offset_adjustment += delta_cyc;
13060 vcpu->arch.last_host_tsc = local_tsc;
13070 kvm->arch.last_tsc_nsec = 0;
13071 kvm->arch.last_tsc_write = 0;
13086 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
13092 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
13098 kfree(kvm->arch.hv_pa_pg);
13110 return -EINVAL;
13112 kvm->arch.vm_type = type;
13113 kvm->arch.has_private_mem =
13116 kvm->arch.pre_fault_allowed =
13118 kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
13132 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
13134 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
13135 mutex_init(&kvm->arch.apic_map_lock);
13136 seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
13137 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
13139 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
13141 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
13143 kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
13144 kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
13145 kvm->arch.guest_can_read_msr_platform_info = true;
13146 kvm->arch.enable_pmu = enable_pmu;
13149 spin_lock_init(&kvm->arch.hv_root_tdp_lock);
13150 kvm->arch.hv_root_tdp = INVALID_PAGE;
13153 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
13154 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
13167 once_init(&kvm->arch.nx_once);
13191 * -errno: on error
13196 * GPA->HVA translation will not change. However, the HVA is a user
13208 lockdep_assert_held(&kvm->slots_lock);
13211 return ERR_PTR_USR(-EINVAL);
13215 if (slot && slot->npages)
13216 return ERR_PTR_USR(-EEXIST);
13227 if (!slot || !slot->npages)
13230 old_npages = slot->npages;
13231 hva = slot->userspace_addr;
13259 * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
13262 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
13263 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
13275 if (current->mm == kvm->mm) {
13281 mutex_lock(&kvm->slots_lock);
13287 mutex_unlock(&kvm->slots_lock);
13290 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
13295 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
13296 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13309 vfree(slot->arch.rmap[i]);
13310 slot->arch.rmap[i] = NULL;
13321 vfree(slot->arch.lpage_info[i - 1]);
13322 slot->arch.lpage_info[i - 1] = NULL;
13330 const int sz = sizeof(*slot->arch.rmap[0]);
13337 if (slot->arch.rmap[i])
13340 slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
13341 if (!slot->arch.rmap[i]) {
13343 return -ENOMEM;
13353 unsigned long npages = slot->npages;
13361 memset(&slot->arch, 0, sizeof(slot->arch));
13381 slot->arch.lpage_info[i - 1] = linfo;
13383 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
13385 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
13386 linfo[lpages - 1].disallow_lpage = 1;
13387 ugfn = slot->userspace_addr >> PAGE_SHIFT;
13392 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
13413 vfree(slot->arch.lpage_info[i - 1]);
13414 slot->arch.lpage_info[i - 1] = NULL;
13416 return -ENOMEM;
13425 * memslots->generation has been incremented.
13430 /* Force re-initialization of steal_time cache */
13445 return -EINVAL;
13448 if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
13449 return -EINVAL;
13451 if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
13452 return -EINVAL;
13458 memcpy(&new->arch, &old->arch, sizeof(old->arch));
13460 return -EIO;
13470 if (!kvm->arch.cpu_dirty_log_size)
13473 nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
13483 u32 old_flags = old ? old->flags : 0;
13484 u32 new_flags = new ? new->flags : 0;
13504 * CREATE: No shadow pages exist, thus nothing to write-protect
13513 * READONLY and non-flags changes were filtered out above, and the only
13533 * Initially-all-set does not require write protecting any page,
13542 if (kvm->arch.cpu_dirty_log_size) {
13558 * write-protected before returning to userspace, i.e. before
13565 * Specifically, KVM also write-protects guest page tables to
13574 * To handle these scenarios, KVM uses a separate software-only
13575 * bit (MMU-writable) to track if a SPTE is !writable due to
13576 * a guest page table being write-protected (KVM clears the
13577 * MMU-writable flag when write-protecting for shadow paging).
13579 * The use of MMU-writable is also the primary motivation for
13582 * !MMU-writable SPTE, KVM must flush if it encounters any
13583 * MMU-writable SPTE regardless of whether the actual hardware
13586 * write access" helpers to ignore MMU-writable entirely.
13589 * access-tracked SPTEs is particularly relevant).
13603 if (!kvm->arch.n_requested_mmu_pages &&
13607 nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
13623 if (vcpu->arch.guest_state_protected)
13633 if (vcpu->arch.guest_state_protected)
13652 if (vcpu->arch.guest_state_protected)
13673 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
13681 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
13682 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
13703 return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
13710 while (vcpu->arch.apf.gfns[key] != ~0)
13713 vcpu->arch.apf.gfns[key] = gfn;
13722 (vcpu->arch.apf.gfns[key] != gfn &&
13723 vcpu->arch.apf.gfns[key] != ~0); i++)
13731 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
13740 if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
13744 vcpu->arch.apf.gfns[i] = ~0;
13747 if (vcpu->arch.apf.gfns[j] == ~0)
13749 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
13756 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
13765 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
13773 return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13782 if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13795 if (!vcpu->arch.apf.send_always &&
13796 (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
13804 return vcpu->arch.apf.delivery_as_pf_vmexit;
13808 * The real mode IDT in particular is unlikely to have a #PF
13822 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
13837 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
13838 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
13846 fault.address = work->arch.token;
13869 .vector = vcpu->arch.apf.vec
13872 if (work->wakeup_all)
13873 work->arch.token = ~0; /* broadcast wakeup */
13875 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
13876 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
13878 if ((work->wakeup_all || work->notpresent_injected) &&
13880 !apf_put_user_ready(vcpu, work->arch.token)) {
13881 vcpu->arch.apf.pageready_pending = true;
13885 vcpu->arch.apf.halted = false;
13892 if (!vcpu->arch.apf.pageready_pending)
13907 * Non-coherent DMA assignment and de-assignment may affect whether or
13910 * (or last) non-coherent device is (un)registered so that new SPTEs
13921 if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
13927 if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
13933 return atomic_read(&kvm->arch.noncoherent_dma_count);
13939 return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
13996 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
14002 mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
14004 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
14015 vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
14028 if (KVM_BUG_ON(!e, vcpu->kvm))
14029 return -EIO;
14039 * doesn't seem to be a real use-case behind such requests, just return
14095 * page tables, so a non-global flush just degenerates to a
14114 struct kvm_run *run = vcpu->run;
14118 BUG_ON(!vcpu->mmio_needed);
14121 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
14122 len = min(8u, frag->len);
14123 if (!vcpu->mmio_is_write)
14124 memcpy(frag->data, run->mmio.data, len);
14126 if (frag->len <= 8) {
14129 vcpu->mmio_cur_fragment++;
14132 frag->data += len;
14133 frag->gpa += len;
14134 frag->len -= len;
14137 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
14138 vcpu->mmio_needed = 0;
14146 run->mmio.phys_addr = frag->gpa;
14147 run->mmio.len = min(8u, frag->len);
14148 run->mmio.is_write = vcpu->mmio_is_write;
14149 if (run->mmio.is_write)
14150 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
14151 run->exit_reason = KVM_EXIT_MMIO;
14153 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14165 return -EINVAL;
14171 bytes -= handled;
14176 frag = vcpu->mmio_fragments;
14177 vcpu->mmio_nr_fragments = 1;
14178 frag->len = bytes;
14179 frag->gpa = gpa;
14180 frag->data = data;
14182 vcpu->mmio_needed = 1;
14183 vcpu->mmio_cur_fragment = 0;
14185 vcpu->run->mmio.phys_addr = gpa;
14186 vcpu->run->mmio.len = min(8u, frag->len);
14187 vcpu->run->mmio.is_write = 1;
14188 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
14189 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14191 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14204 return -EINVAL;
14210 bytes -= handled;
14215 frag = vcpu->mmio_fragments;
14216 vcpu->mmio_nr_fragments = 1;
14217 frag->len = bytes;
14218 frag->gpa = gpa;
14219 frag->data = data;
14221 vcpu->mmio_needed = 1;
14222 vcpu->mmio_cur_fragment = 0;
14224 vcpu->run->mmio.phys_addr = gpa;
14225 vcpu->run->mmio.len = min(8u, frag->len);
14226 vcpu->run->mmio.is_write = 0;
14227 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14229 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14237 vcpu->arch.sev_pio_count -= count;
14238 vcpu->arch.sev_pio_data += count * size;
14246 int size = vcpu->arch.pio.size;
14247 int port = vcpu->arch.pio.port;
14249 vcpu->arch.pio.count = 0;
14250 if (vcpu->arch.sev_pio_count)
14260 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14261 int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
14269 if (!vcpu->arch.sev_pio_count)
14273 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
14282 unsigned count = vcpu->arch.pio.count;
14283 int size = vcpu->arch.pio.size;
14284 int port = vcpu->arch.pio.port;
14286 complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
14288 if (vcpu->arch.sev_pio_count)
14298 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14299 if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
14304 if (!vcpu->arch.sev_pio_count)
14308 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
14316 vcpu->arch.sev_pio_data = data;
14317 vcpu->arch.sev_pio_count = count;