Lines Matching +full:use +full:- +full:rtm
1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
16 * Ben-Ami Yassour <benami@il.ibm.com>
48 #include <linux/user-return-notifier.h>
107 ((struct kvm_vcpu *)(ctxt)->vcpu)
110 * - enable syscall per default because it's emulated by KVM
111 * - enable LME and LMA per default on 64 bit KVM
145 *(((struct kvm_x86_ops *)0)->func));
148 #include <asm/kvm-x86-ops.h>
162 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
171 * Flags to manipulate forced emulation behavior (any non-zero value will
178 int __read_mostly pi_inject_timer = -1;
224 * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs).
458 * List of MSRs that control the existence of MSR-based features, i.e. MSRs
471 (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
563 size - useroffset, NULL);
572 vcpu->arch.apf.gfns[i] = ~0;
592 msrs->registered = false;
596 values = &msrs->values[slot];
597 if (values->host != values->curr) {
598 wrmsrq(kvm_uret_msrs_list[slot], values->host);
599 values->curr = values->host;
624 return -1;
639 return -1;
651 msrs->values[i].host = value;
652 msrs->values[i].curr = value;
658 if (!msrs->registered) {
659 msrs->urn.on_user_return = kvm_on_user_return;
660 user_return_notifier_register(&msrs->urn);
661 msrs->registered = true;
670 value = (value & mask) | (msrs->values[slot].host & ~mask);
671 if (value == msrs->values[slot].curr)
677 msrs->values[slot].curr = value;
685 return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
693 if (msrs->registered)
694 kvm_on_user_return(&msrs->urn);
748 * #DBs can be trap-like or fault-like, the caller must check other CPU
767 if (!ex->has_payload)
770 switch (ex->vector) {
773 * "Certain debug exceptions may clear bit 0-3. The
777 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
786 * Active low bits should be cleared if 1-setting in payload.
787 * Active high bits should be set if 1-setting in payload.
794 vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
795 vcpu->arch.dr6 |= ex->payload;
796 vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
804 vcpu->arch.dr6 &= ~BIT(12);
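The active-low merge at lines 794-796 above is easier to see in isolation. The following stand-alone sketch (not kernel code; a generic mask parameter stands in for DR6_ACTIVE_LOW) shows how a payload that encodes every asserted #DB condition as a 1 bit is folded into DR6.

/*
 * Stand-alone sketch of the merge above: @payload has a 1 for every
 * asserted condition; active-high DR6 bits are set by the OR, and
 * asserted active-low bits end up cleared by the final XOR.
 */
static unsigned long merge_db_payload(unsigned long dr6, unsigned long payload,
				      unsigned long active_low_mask)
{
	dr6 |= active_low_mask;            /* active-low bits default to 1   */
	dr6 |= payload;                    /* asserted active-high bits -> 1 */
	dr6 ^= payload & active_low_mask;  /* asserted active-low bits  -> 0 */
	return dr6;
}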
807 vcpu->arch.cr2 = ex->payload;
811 ex->has_payload = false;
812 ex->payload = 0;
820 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
822 ex->vector = vector;
823 ex->injected = false;
824 ex->pending = true;
825 ex->has_error_code = has_error_code;
826 ex->error_code = error_code;
827 ex->has_payload = has_payload;
828 ex->payload = payload;
841 * If the exception is destined for L2, morph it to a VM-Exit if L1
845 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
851 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
853 vcpu->arch.exception.pending = true;
854 vcpu->arch.exception.injected = false;
856 vcpu->arch.exception.has_error_code = has_error;
857 vcpu->arch.exception.vector = nr;
858 vcpu->arch.exception.error_code = error_code;
859 vcpu->arch.exception.has_payload = has_payload;
860 vcpu->arch.exception.payload = payload;
863 &vcpu->arch.exception);
868 prev_nr = vcpu->arch.exception.vector;
870 /* triple fault -> shutdown */
882 vcpu->arch.exception.injected = false;
883 vcpu->arch.exception.pending = false;
888 that instruction re-execution will regenerate lost
919 * On VM-Entry, an exception can be pending if and only if event
929 * re-checking is incorrect if _L1_ injected the exception, in which
934 vcpu->arch.exception.injected = true;
935 vcpu->arch.exception.has_error_code = has_error_code;
936 vcpu->arch.exception.vector = nr;
937 vcpu->arch.exception.error_code = error_code;
938 vcpu->arch.exception.has_payload = false;
939 vcpu->arch.exception.payload = 0;
967 ++vcpu->stat.pf_guest;
970 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
973 if (is_guest_mode(vcpu) && fault->async_page_fault)
975 true, fault->error_code,
976 true, fault->address);
978 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
979 fault->address);
986 WARN_ON_ONCE(fault->vector != PF_VECTOR);
988 fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
989 vcpu->arch.walk_mmu;
995 if ((fault->error_code & PFERR_PRESENT_MASK) &&
996 !(fault->error_code & PFERR_RSVD_MASK))
997 kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
1000 fault_mmu->inject_page_fault(vcpu, fault);
1006 atomic_inc(&vcpu->arch.nmi_queued);
1042 return (vcpu->arch.apf.msr_en_val & mask) == mask;
1047 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
1055 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
1060 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
1088 if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
1089 kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
1091 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
1094 vcpu->arch.pdptrs_from_userspace = false;
1119 * CR0.WP is incorporated into the MMU role, but only for non-nested,
1171 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
1182 if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
1210 if (vcpu->arch.guest_state_protected)
1216 if (vcpu->arch.xcr0 != kvm_host.xcr0)
1218 load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
1221 vcpu->arch.ia32_xss != kvm_host.xss)
1222 wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
1227 if (vcpu->arch.guest_state_protected)
1231 vcpu->arch.pkru != vcpu->arch.host_pkru &&
1232 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1234 wrpkru(vcpu->arch.pkru);
1239 if (vcpu->arch.guest_state_protected)
1243 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1245 vcpu->arch.pkru = rdpkru();
1246 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1247 wrpkru(vcpu->arch.host_pkru);
1254 return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1261 u64 old_xcr0 = vcpu->arch.xcr0;
1277 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1296 vcpu->arch.xcr0 = xcr0;
1299 vcpu->arch.cpuid_dynamic_bits_dirty = true;
1329 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1343 * - CR4.PCIDE is changed from 1 to 0
1344 * - CR4.PGE is toggled
1355 * - CR4.SMEP is changed from 0 to 1
1356 * - CR4.PAE is toggled
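The two bullet pairs above (lines 1343-1344 and 1355-1356) come from the comment that separates CR4 changes requiring a flush of all PCIDs from those requiring a flush of only the current PCID. A condensed sketch of that decision follows; flush_all_pcids() and flush_current_pcid() are hypothetical stand-ins, not KVM's request-based flush machinery.

/*
 * Condensed sketch of the rules above (illustrative only; the two flush
 * helpers are hypothetical, not KVM functions).
 */
static void cr4_flush_sketch(unsigned long old_cr4, unsigned long new_cr4)
{
	/* Flush all PCIDs: CR4.PCIDE changed 1 -> 0, or CR4.PGE toggled. */
	if (((old_cr4 & X86_CR4_PCIDE) && !(new_cr4 & X86_CR4_PCIDE)) ||
	    ((old_cr4 ^ new_cr4) & X86_CR4_PGE))
		flush_all_pcids();		/* hypothetical helper */
	/* Flush the current PCID: CR4.SMEP changed 0 -> 1, or CR4.PAE toggled. */
	else if ((!(old_cr4 & X86_CR4_SMEP) && (new_cr4 & X86_CR4_SMEP)) ||
		 ((old_cr4 ^ new_cr4) & X86_CR4_PAE))
		flush_current_pcid();		/* hypothetical helper */
}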
1401 struct kvm_mmu *mmu = vcpu->arch.mmu;
1418 * If neither the current CR3 nor any of the prev_roots use the given
1436 if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1439 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1472 vcpu->arch.cr3 = cr3;
1481 * and it's impossible to use a non-zero PCID when PCID is disabled,
1498 vcpu->arch.cr8 = cr8;
1508 return vcpu->arch.cr8;
1516 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1518 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1526 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1527 dr7 = vcpu->arch.guest_debug_dr7;
1529 dr7 = vcpu->arch.dr7;
1531 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1533 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1551 size_t size = ARRAY_SIZE(vcpu->arch.db);
1555 vcpu->arch.db[array_index_nospec(dr, size)] = val;
1556 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1557 vcpu->arch.eff_db[dr] = val;
1563 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1569 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1580 size_t size = ARRAY_SIZE(vcpu->arch.db);
1584 return vcpu->arch.db[array_index_nospec(dr, size)];
1587 return vcpu->arch.dr6;
1590 return vcpu->arch.dr7;
1614 * 10 - MISC_PACKAGE_CTRLS
1615 * 11 - ENERGY_FILTERING_CTL
1616 * 12 - DOITM
1617 * 18 - FB_CLEAR_CTRL
1618 * 21 - XAPIC_DISABLE_STATUS
1619 * 23 - OVERCLOCKING_STATUS
1667 * If RTM=0 because the kernel has disabled TSX, the host might
1668 * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
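Since the comment is truncated here, a minimal sketch of the adjustment it describes follows. It assumes the X86_FEATURE_RTM and ARCH_CAP_TAA_NO definitions from the kernel headers and a local data variable holding the ARCH_CAPABILITIES value being assembled, and it omits the in-tree handling of TSX_CTRL.

	/*
	 * Minimal sketch (assumptions noted above): with RTM hidden, the
	 * guest already knows TAA cannot occur, so TAA_NO adds nothing.
	 */
	if (!boot_cpu_has(X86_FEATURE_RTM))
		data &= ~ARCH_CAP_TAA_NO;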
1752 u64 old_efer = vcpu->arch.efer;
1753 u64 efer = msr_info->data;
1759 if (!msr_info->host_initiated) {
1764 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1769 efer |= vcpu->arch.efer & EFER_LMA;
1797 struct kvm *kvm = vcpu->kvm;
1806 idx = srcu_read_lock(&kvm->srcu);
1808 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1814 allowed = msr_filter->default_allow;
1815 ranges = msr_filter->ranges;
1817 for (i = 0; i < msr_filter->count; i++) {
1824 allowed = test_bit(index - start, bitmap);
1830 srcu_read_unlock(&kvm->srcu, idx);
1839 * Returns 0 on success, non-0 otherwise.
1860 * non-canonical address is written on Intel but not on
1861 * AMD (which ignores the top 32-bits, because it does
1862 * not implement 64-bit SYSENTER).
1864 * 64-bit code should hence be able to write a non-canonical
1866 * vmentry does not fail on Intel after writing a non-canonical
1868 * invokes 64-bit SYSENTER.
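The resolution the comment describes is to store a canonical value regardless of vendor. A small illustrative helper follows; it mirrors the sign-extension performed by the kernel's __canonical_address(), with the virtual-address width passed in by the caller.

/*
 * Illustrative helper: sign-extend @addr from @vaddr_bits so the stored
 * SYSENTER_EIP/ESP value is always canonical.
 */
static u64 make_canonical(u64 addr, unsigned int vaddr_bits)
{
	return (u64)((s64)(addr << (64 - vaddr_bits)) >> (64 - vaddr_bits));
}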
1887 * clear the bits. This ensures cross-vendor migration will
1929 /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
1958 * Returns 0 on success, non-0 otherwise.
2052 if (!vcpu->run->msr.error) {
2053 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
2054 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
2060 return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2071 return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
2082 if (!vcpu->run->msr.error)
2083 kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
2084 vcpu->run->msr.data);
2109 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2112 vcpu->run->exit_reason = exit_reason;
2113 vcpu->run->msr.error = 0;
2114 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2115 vcpu->run->msr.reason = msr_reason;
2116 vcpu->run->msr.index = index;
2117 vcpu->run->msr.data = data;
2118 vcpu->arch.complete_userspace_io = completion;
2135 kvm_rax_write(vcpu, data & -1u);
2136 kvm_rdx_write(vcpu, (data >> 32) & -1u);
2153 return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
2160 vcpu->arch.cui_rdmsr_imm_reg = reg;
2233 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
2236 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
2239 enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
2264 return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
2272 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
2273 kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
2325 return -EINVAL;
2357 write_seqcount_begin(&vdata->seq);
2360 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
2361 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
2362 vdata->clock.mask = tk->tkr_mono.mask;
2363 vdata->clock.mult = tk->tkr_mono.mult;
2364 vdata->clock.shift = tk->tkr_mono.shift;
2365 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
2366 vdata->clock.offset = tk->tkr_mono.base;
2368 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
2369 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
2370 vdata->raw_clock.mask = tk->tkr_raw.mask;
2371 vdata->raw_clock.mult = tk->tkr_raw.mult;
2372 vdata->raw_clock.shift = tk->tkr_raw.shift;
2373 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
2374 vdata->raw_clock.offset = tk->tkr_raw.base;
2376 vdata->wall_time_sec = tk->xtime_sec;
2378 vdata->offs_boot = tk->offs_boot;
2380 write_seqcount_end(&vdata->seq);
2391 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
2440 struct kvm_arch *ka = &vcpu->kvm->arch;
2442 if (vcpu->vcpu_id == 0 && !host_initiated) {
2443 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2446 ka->boot_vcpu_runs_old_kvmclock = old_msr;
2449 vcpu->arch.time = system_time;
2454 kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2457 kvm_gpc_deactivate(&vcpu->arch.pv_time);
2480 shift--;
2525 vcpu->arch.tsc_catchup = 1;
2526 vcpu->arch.tsc_always_catchup = 1;
2530 return -1;
2534 /* TSC scaling required - calculate ratio */
2539 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2541 return -1;
2557 return -1;
2562 &vcpu->arch.virtual_tsc_shift,
2563 &vcpu->arch.virtual_tsc_mult);
2564 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2572 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2584 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2585 vcpu->arch.virtual_tsc_mult,
2586 vcpu->arch.virtual_tsc_shift);
2587 tsc += vcpu->arch.this_tsc_write;
2601 struct kvm_arch *ka = &vcpu->kvm->arch;
2605 * To use the masterclock, the host clocksource must be based on TSC
2609 bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
2610 atomic_read(&vcpu->kvm->online_vcpus)) &&
2611 gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2619 if ((ka->use_master_clock && new_generation) ||
2620 (ka->use_master_clock != use_master_clock))
2623 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2624 atomic_read(&vcpu->kvm->online_vcpus),
2625 ka->use_master_clock, gtod->clock.vclock_mode);
2632 * The most significant 64-N bits (mult) of ratio represent the
2635 * point number (mult + frac * 2^(-N)).
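To make the fixed-point format concrete: with N fractional bits, a guest-to-host frequency ratio is stored as (guest_khz << N) / host_khz, and scaling a TSC value is a 64x64->128-bit multiply followed by a right shift of N. A worked sketch, assuming N = 48 and the mul_u64_u64_shr() helper from <linux/math64.h>:

/*
 * Worked example of the format above, assuming N = 48 fractional bits:
 * ratio = (guest_khz << 48) / host_khz.
 */
static u64 scale_tsc_sketch(u64 host_tsc, u64 ratio)
{
	return mul_u64_u64_shr(host_tsc, ratio, 48);
}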
2658 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2660 return target_tsc - tsc;
2665 return vcpu->arch.l1_tsc_offset +
2666 kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2697 if (vcpu->arch.guest_tsc_protected)
2700 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2701 vcpu->arch.l1_tsc_offset,
2704 vcpu->arch.l1_tsc_offset = l1_offset;
2712 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2717 vcpu->arch.tsc_offset = l1_offset;
2724 vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2728 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2732 vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2742 * TSC is marked unstable when we're running on Hyper-V,
2759 struct kvm *kvm = vcpu->kvm;
2761 lockdep_assert_held(&kvm->arch.tsc_write_lock);
2763 if (vcpu->arch.guest_tsc_protected)
2767 vcpu->kvm->arch.user_set_tsc = true;
2773 kvm->arch.last_tsc_nsec = ns;
2774 kvm->arch.last_tsc_write = tsc;
2775 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2776 kvm->arch.last_tsc_offset = offset;
2778 vcpu->arch.last_guest_tsc = tsc;
2790 * These values are tracked in kvm->arch.cur_xxx variables.
2792 kvm->arch.cur_tsc_generation++;
2793 kvm->arch.cur_tsc_nsec = ns;
2794 kvm->arch.cur_tsc_write = tsc;
2795 kvm->arch.cur_tsc_offset = offset;
2796 kvm->arch.nr_vcpus_matched_tsc = 0;
2797 } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2798 kvm->arch.nr_vcpus_matched_tsc++;
2802 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2803 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2804 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2812 struct kvm *kvm = vcpu->kvm;
2818 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2821 elapsed = ns - kvm->arch.last_tsc_nsec;
2823 if (vcpu->arch.virtual_tsc_khz) {
2830 } else if (kvm->arch.user_set_tsc) {
2831 u64 tsc_exp = kvm->arch.last_tsc_write +
2833 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2835 * Here lies UAPI baggage: when a user-initiated TSC write has
2846 * come from the kernel's default vCPU creation. Make the 1-second
2862 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2864 offset = kvm->arch.cur_tsc_offset;
2874 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2880 u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2886 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2889 vcpu->arch.l1_tsc_scaling_ratio);
2921 switch (clock->vclock_mode) {
2927 v = (tsc_pg_val - clock->cycle_last) &
2928 clock->mask;
2937 v = (*tsc_timestamp - clock->cycle_last) &
2938 clock->mask;
2947 return v * clock->mult;
2952 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
2962 seq = read_seqcount_begin(&gtod->seq);
2963 ns = gtod->raw_clock.base_cycles;
2964 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2965 ns >>= gtod->raw_clock.shift;
2966 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2967 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2985 seq = read_seqcount_begin(&gtod->seq);
2986 ns = gtod->clock.base_cycles;
2987 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2988 ns >>= gtod->clock.shift;
2989 ns += ktime_to_ns(gtod->clock.offset);
2990 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3004 seq = read_seqcount_begin(&gtod->seq);
3005 ts->tv_sec = gtod->wall_time_sec;
3006 ns = gtod->clock.base_cycles;
3007 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
3008 ns >>= gtod->clock.shift;
3009 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3011 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
3012 ts->tv_nsec = ns;
3050 * DO NOT USE this for anything related to migration. You want CLOCK_TAI
3080 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
3081 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
3082 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
3086 * - ret0 < ret1
3087 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
3089 * - 0 < N - M => M < N
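A toy restatement of the inequality above with concrete numbers (purely illustrative, not kernel code): CPU1 samples its (timespec, tsc) pair N ns and M ticks after CPU0 did, and a guest later combines either sample with the same rdtsc value.

/* Toy numbers: monotonicity across the two samples needs M < N. */
static int monotonic_example(void)
{
	u64 timespec0 = 1000, tsc0 = 5000;		/* CPU0 sample        */
	u64 N = 30, M = 20;				/* CPU1 sampled later  */
	u64 rdtsc_now = 6000;				/* later guest read    */
	u64 ret0 = timespec0 + (rdtsc_now - tsc0);		/* = 2000 */
	u64 ret1 = timespec0 + N + (rdtsc_now - (tsc0 + M));	/* = 2010 */

	return ret1 > ret0;	/* ret1 - ret0 == N - M, true iff M < N */
}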
3097 * system_timestamp/tsc_timestamp values simultaneously: use a master
3108 struct kvm_arch *ka = &kvm->arch;
3112 lockdep_assert_held(&kvm->arch.tsc_write_lock);
3113 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
3114 atomic_read(&kvm->online_vcpus));
3121 &ka->master_kernel_ns,
3122 &ka->master_cycle_now);
3124 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
3125 && !ka->backwards_tsc_observed
3126 && !ka->boot_vcpu_runs_old_kvmclock;
3128 if (ka->use_master_clock)
3132 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
3144 raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
3145 write_seqcount_begin(&kvm->arch.pvclock_sc);
3158 struct kvm_arch *ka = &kvm->arch;
3162 write_seqcount_end(&ka->pvclock_sc);
3163 raw_spin_unlock_irq(&ka->tsc_write_lock);
3181 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
3182 * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
3196 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
3199 struct kvm_arch *ka = &kvm->arch;
3205 data->flags = 0;
3206 if (ka->use_master_clock &&
3211 if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3212 data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3213 data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3216 data->host_tsc = rdtsc();
3218 data->flags |= KVM_CLOCK_TSC_STABLE;
3219 hv_clock.tsc_timestamp = ka->master_cycle_now;
3220 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3224 data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3226 data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3234 struct kvm_arch *ka = &kvm->arch;
3238 seq = read_seqcount_begin(&ka->pvclock_sc);
3240 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3262 read_lock_irqsave(&gpc->lock, flags);
3264 read_unlock_irqrestore(&gpc->lock, flags);
3269 read_lock_irqsave(&gpc->lock, flags);
3272 guest_hv_clock = (void *)(gpc->khva + offset);
3281 guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
3285 hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3291 guest_hv_clock->version = ++hv_clock.version;
3294 read_unlock_irqrestore(&gpc->lock, flags);
3296 trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
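For context, the odd/even version dance above is a seqcount-style protocol. A sketch of the matching consumer loop is below, using the field names of struct pvclock_vcpu_time_info; it is illustrative, not the in-tree pvclock reader helpers.

/*
 * Sketch of the consumer side: retry while an update is in flight (odd
 * version) or the version changed underneath the read.
 */
static u64 pvclock_read_tsc_timestamp(volatile struct pvclock_vcpu_time_info *hv)
{
	u32 version;
	u64 tsc_timestamp;

	do {
		version = hv->version;
		smp_rmb();
		tsc_timestamp = hv->tsc_timestamp;
		smp_rmb();
	} while ((version & 1) || version != hv->version);

	return tsc_timestamp;
}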
3304 struct kvm_vcpu_arch *vcpu = &v->arch;
3305 struct kvm_arch *ka = &v->kvm->arch;
3318 seq = read_seqcount_begin(&ka->pvclock_sc);
3319 use_master_clock = ka->use_master_clock;
3321 host_tsc = ka->master_cycle_now;
3322 kernel_ns = ka->master_kernel_ns;
3324 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3351 if (vcpu->tsc_catchup) {
3354 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3365 v->arch.l1_tsc_scaling_ratio);
3369 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3371 &vcpu->pvclock_tsc_shift,
3372 &vcpu->pvclock_tsc_mul);
3373 vcpu->hw_tsc_khz = tgt_tsc_khz;
3376 hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
3377 hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
3379 hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3380 vcpu->last_guest_tsc = tsc_timestamp;
3387 if (vcpu->pv_time.active) {
3393 if (vcpu->pvclock_set_guest_stopped_request) {
3395 vcpu->pvclock_set_guest_stopped_request = false;
3397 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
3402 kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
3407 * explicitly told to use TSC as its clocksource Xen will not set this bit.
3413 if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
3416 if (vcpu->xen.vcpu_info_cache.active)
3417 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
3419 if (vcpu->xen.vcpu_time_info_cache.active)
3420 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
3448 struct kvm_arch *ka = &kvm->arch;
3454 seq = read_seqcount_begin(&ka->pvclock_sc);
3457 if (!ka->use_master_clock)
3479 hv_clock.tsc_timestamp = ka->master_cycle_now;
3480 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3482 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3488 * since 1970-01-01.
3494 return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
3498 return ktime_get_real_ns() - get_kvmclock_ns(kvm);
3503 * vcpu->cpu migration, should not allow system_timestamp from
3514 struct kvm *kvm = v->kvm;
3539 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3546 u64 mcg_cap = vcpu->arch.mcg_cap;
3548 u32 msr = msr_info->index;
3549 u64 data = msr_info->data;
3554 vcpu->arch.mcg_status = data;
3558 (data || !msr_info->host_initiated))
3562 vcpu->arch.mcg_ctl = data;
3564 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3565 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3569 if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3574 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3575 last_msr + 1 - MSR_IA32_MC0_CTL2);
3576 vcpu->arch.mci_ctl2_banks[offset] = data;
3578 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3579 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3591 * single-bit ECC data errors.
3599 * AMD-based CPUs allow non-zero values, but if and only if
3602 if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3606 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3607 last_msr + 1 - MSR_IA32_MC0_CTL);
3608 vcpu->arch.mce_banks[offset] = data;
3635 vcpu->arch.apf.msr_en_val = data;
3643 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3647 vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
3648 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3657 /* Bits 8-63 are reserved */
3664 vcpu->arch.apf.msr_int_val = data;
3666 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3673 kvm_gpc_deactivate(&vcpu->arch.pv_time);
3674 vcpu->arch.time = 0;
3679 ++vcpu->stat.tlb_flush;
3688 ++vcpu->stat.tlb_flush;
3704 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3713 ++vcpu->stat.tlb_flush;
3721 * prior before nested VM-Enter/VM-Exit.
3735 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3738 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3742 if (kvm_xen_msr_enabled(vcpu->kvm)) {
3747 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3750 if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3753 slots = kvm_memslots(vcpu->kvm);
3755 if (unlikely(slots->generation != ghc->generation ||
3756 gpa != ghc->gpa ||
3757 kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3759 BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3761 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3762 kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3766 st = (struct kvm_steal_time __user *)ghc->hva;
3773 int err = -EFAULT;
3784 "+m" (st->preempted));
3790 vcpu->arch.st.preempted = 0;
3792 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3803 unsafe_put_user(0, &st->preempted, out);
3804 vcpu->arch.st.preempted = 0;
3807 unsafe_get_user(version, &st->version, out);
3812 unsafe_put_user(version, &st->version, out);
3816 unsafe_get_user(steal, &st->steal, out);
3817 steal += current->sched_info.run_delay -
3818 vcpu->arch.st.last_steal;
3819 vcpu->arch.st.last_steal = current->sched_info.run_delay;
3820 unsafe_put_user(steal, &st->steal, out);
3823 unsafe_put_user(version, &st->version, out);
3828 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3854 * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
3859 * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
3860 * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
3870 KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
3871 KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
3875 rdmsrq(msr_info->index, msr_info->data);
3877 wrmsrq(msr_info->index, msr_info->data);
3893 u32 msr = msr_info->index;
3894 u64 data = msr_info->data;
3897 * Do not allow host-initiated writes to trigger the Xen hypercall
3901 if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
3902 !msr_info->host_initiated)
3917 if (msr_info->host_initiated)
3918 vcpu->arch.microcode_version = data;
3921 if (!msr_info->host_initiated ||
3924 vcpu->arch.arch_capabilities = data;
3927 if (!msr_info->host_initiated ||
3939 if (vcpu->arch.perf_capabilities == data)
3942 vcpu->arch.perf_capabilities = data;
3948 if (!msr_info->host_initiated) {
3976 if (!msr_info->host_initiated &&
4003 vcpu->arch.msr_hwcr = data;
4015 vcpu->arch.pat = data;
4021 return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
4029 if (!msr_info->host_initiated) {
4030 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
4037 vcpu->arch.ia32_tsc_adjust_msr = data;
4041 u64 old_val = vcpu->arch.ia32_misc_enable_msr;
4043 if (!msr_info->host_initiated) {
4053 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
4057 vcpu->arch.ia32_misc_enable_msr = data;
4058 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4060 vcpu->arch.ia32_misc_enable_msr = data;
4065 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4067 vcpu->arch.smbase = data;
4070 vcpu->arch.msr_ia32_power_ctl = data;
4073 if (msr_info->host_initiated) {
4075 } else if (!vcpu->arch.guest_tsc_protected) {
4076 u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
4078 vcpu->arch.ia32_tsc_adjust_msr += adj;
4085 if (data & ~vcpu->arch.guest_supported_xss)
4087 if (vcpu->arch.ia32_xss == data)
4089 vcpu->arch.ia32_xss = data;
4090 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4093 if (!msr_info->host_initiated)
4095 vcpu->arch.smi_count = data;
4101 vcpu->kvm->arch.wall_clock = data;
4102 kvm_write_wall_clock(vcpu->kvm, data, 0);
4108 vcpu->kvm->arch.wall_clock = data;
4109 kvm_write_wall_clock(vcpu->kvm, data, 0);
4115 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
4121 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
4145 smp_store_mb(vcpu->arch.apf.pageready_pending, false);
4160 vcpu->arch.st.msr_val = data;
4181 if (data & (-1ULL << 1))
4184 vcpu->arch.msr_kvm_poll_control = data;
4189 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4190 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4207 * all pre-dating SVM, but a recommended workaround from
4225 msr_info->host_initiated);
4228 /* Drop writes to this legacy MSR -- see rdmsr
4236 vcpu->arch.osvw.length = data;
4241 vcpu->arch.osvw.status = data;
4244 if (!msr_info->host_initiated)
4246 vcpu->arch.msr_platform_info = data;
4253 vcpu->arch.msr_misc_features_enables = data;
4257 if (!msr_info->host_initiated &&
4264 fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4267 if (!msr_info->host_initiated &&
4274 vcpu->arch.guest_fpu.xfd_err = data;
4294 u64 mcg_cap = vcpu->arch.mcg_cap;
4304 data = vcpu->arch.mcg_cap;
4309 data = vcpu->arch.mcg_ctl;
4312 data = vcpu->arch.mcg_status;
4314 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4315 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
4321 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
4322 last_msr + 1 - MSR_IA32_MC0_CTL2);
4323 data = vcpu->arch.mci_ctl2_banks[offset];
4325 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4326 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
4330 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
4331 last_msr + 1 - MSR_IA32_MC0_CTL);
4332 data = vcpu->arch.mce_banks[offset];
4343 switch (msr_info->index) {
4366 * so for existing CPU-specific MSRs.
4373 msr_info->data = 0;
4379 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4381 msr_info->data = 0;
4384 msr_info->data = vcpu->arch.microcode_version;
4389 msr_info->data = vcpu->arch.arch_capabilities;
4394 msr_info->data = vcpu->arch.perf_capabilities;
4397 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4406 * return L1's TSC value to ensure backwards-compatible
4411 if (msr_info->host_initiated) {
4412 offset = vcpu->arch.l1_tsc_offset;
4413 ratio = vcpu->arch.l1_tsc_scaling_ratio;
4415 offset = vcpu->arch.tsc_offset;
4416 ratio = vcpu->arch.tsc_scaling_ratio;
4419 msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4423 msr_info->data = vcpu->arch.pat;
4428 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4430 msr_info->data = 3;
4444 msr_info->data = 1 << 24;
4447 msr_info->data = vcpu->arch.apic_base;
4450 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4452 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4455 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4458 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4461 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4463 msr_info->data = vcpu->arch.smbase;
4466 msr_info->data = vcpu->arch.smi_count;
4470 msr_info->data = 1000ULL;
4472 msr_info->data |= (((uint64_t)4ULL) << 40);
4475 msr_info->data = vcpu->arch.efer;
4481 msr_info->data = vcpu->kvm->arch.wall_clock;
4487 msr_info->data = vcpu->kvm->arch.wall_clock;
4493 msr_info->data = vcpu->arch.time;
4499 msr_info->data = vcpu->arch.time;
4505 msr_info->data = vcpu->arch.apf.msr_en_val;
4511 msr_info->data = vcpu->arch.apf.msr_int_val;
4517 msr_info->data = 0;
4523 msr_info->data = vcpu->arch.st.msr_val;
4529 msr_info->data = vcpu->arch.pv_eoi.msr_val;
4535 msr_info->data = vcpu->arch.msr_kvm_poll_control;
4542 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4543 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4544 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4545 msr_info->host_initiated);
4547 if (!msr_info->host_initiated &&
4550 msr_info->data = vcpu->arch.ia32_xss;
4554 * Provide expected ramp-up count for K7. All other
4562 msr_info->data = 0x20000000;
4576 msr_info->index, &msr_info->data,
4577 msr_info->host_initiated);
4590 msr_info->data = 0xbe702111;
4595 msr_info->data = vcpu->arch.osvw.length;
4600 msr_info->data = vcpu->arch.osvw.status;
4603 if (!msr_info->host_initiated &&
4604 !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4606 msr_info->data = vcpu->arch.msr_platform_info;
4609 msr_info->data = vcpu->arch.msr_misc_features_enables;
4612 msr_info->data = vcpu->arch.msr_hwcr;
4616 if (!msr_info->host_initiated &&
4620 msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4623 if (!msr_info->host_initiated &&
4627 msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4635 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4657 for (i = 0; i < msrs->nmsrs; ++i) {
4659 * If userspace is accessing one or more XSTATE-managed MSRs,
4692 r = -EFAULT;
4696 r = -E2BIG;
4701 entries = memdup_user(user_msrs->entries, size);
4709 if (writeback && copy_to_user(user_msrs->entries, entries, size))
4710 r = -EFAULT;
4748 r = -EFAULT;
4752 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4756 r = -EFAULT;
4771 return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
4914 r = kvm->max_vcpus;
4936 r = kvm_x86_ops.nested_ops->get_state ?
4937 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4944 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
4989 if (attr->group) {
4991 return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
4992 return -ENXIO;
4995 switch (attr->attr) {
5000 return -ENXIO;
5006 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5015 return -EFAULT;
5039 r = -EFAULT;
5046 r = -E2BIG;
5049 r = -EFAULT;
5050 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
5053 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
5065 r = -EFAULT;
5069 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
5074 r = -EFAULT;
5081 r = -EFAULT;
5092 r = -EFAULT;
5099 r = -E2BIG;
5102 r = -EFAULT;
5103 if (copy_to_user(user_msr_list->indices, &msr_based_features,
5119 r = -EFAULT;
5127 r = -EFAULT;
5134 r = -EINVAL;
5143 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
5154 if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
5155 pmu->need_cleanup = true;
5162 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
5163 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
5164 wbinvd_on_cpu(vcpu->cpu);
5175 * is handled on the nested VM-Exit path.
5183 vcpu->arch.host_pkru = read_pkru();
5186 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
5187 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
5188 vcpu->arch.tsc_offset_adjustment = 0;
5192 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
5193 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
5194 rdtsc() - vcpu->arch.last_host_tsc;
5200 vcpu->arch.last_guest_tsc);
5202 if (!vcpu->arch.guest_tsc_protected)
5203 vcpu->arch.tsc_catchup = 1;
5211 * kvmclock on vcpu->cpu migration
5213 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
5215 if (vcpu->cpu != cpu)
5217 vcpu->cpu = cpu;
5225 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
5229 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
5232 * The vCPU can be marked preempted if and only if the VM-Exit was on
5236 * preempted if and only if the VM-Exit was due to a host interrupt.
5238 if (!vcpu->arch.at_instruction_boundary) {
5239 vcpu->stat.preemption_other++;
5243 vcpu->stat.preemption_reported++;
5244 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
5247 if (vcpu->arch.st.preempted)
5251 if (unlikely(current->mm != vcpu->kvm->mm))
5254 slots = kvm_memslots(vcpu->kvm);
5256 if (unlikely(slots->generation != ghc->generation ||
5257 gpa != ghc->gpa ||
5258 kvm_is_error_hva(ghc->hva) || !ghc->memslot))
5261 st = (struct kvm_steal_time __user *)ghc->hva;
5262 BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
5264 if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5265 vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
5267 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5274 if (vcpu->preempted) {
5276 * Assume protected guests are in-kernel. Inefficient yielding
5280 vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected ||
5287 idx = srcu_read_lock(&vcpu->kvm->srcu);
5288 if (kvm_xen_msr_enabled(vcpu->kvm))
5292 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5296 vcpu->arch.last_host_tsc = rdtsc();
5302 if (vcpu->arch.apic->guest_apic_protected)
5303 return -EINVAL;
5315 if (vcpu->arch.apic->guest_apic_protected)
5316 return -EINVAL;
5349 * instruction boundary and with no events half-injected.
5360 if (irq->irq >= KVM_NR_INTERRUPTS)
5361 return -EINVAL;
5363 if (!irqchip_in_kernel(vcpu->kvm)) {
5364 kvm_queue_interrupt(vcpu, irq->irq, false);
5370 * With in-kernel LAPIC, we only use this to inject EXTINT, so
5371 * fail for in-kernel 8259.
5373 if (pic_in_kernel(vcpu->kvm))
5374 return -ENXIO;
5376 if (vcpu->arch.pending_external_vector != -1)
5377 return -EEXIST;
5379 vcpu->arch.pending_external_vector = irq->irq;
5394 if (tac->flags)
5395 return -EINVAL;
5396 vcpu->arch.tpr_access_reporting = !!tac->enabled;
5406 r = -EINVAL;
5412 vcpu->arch.mcg_cap = mcg_cap;
5415 vcpu->arch.mcg_ctl = ~(u64)0;
5418 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
5420 vcpu->arch.mci_ctl2_banks[bank] = 0;
5433 * - none of the bits for Machine Check Exceptions are set
5434 * - both the VAL (valid) and UC (uncorrectable) bits are set
5435 * MCI_STATUS_PCC - Processor Context Corrupted
5436 * MCI_STATUS_S - Signaled as a Machine Check Exception
5437 * MCI_STATUS_AR - Software recoverable Action Required
5441 return !mce->mcg_status &&
5442 !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5443 (mce->status & MCI_STATUS_VAL) &&
5444 (mce->status & MCI_STATUS_UC);
5449 u64 mcg_cap = vcpu->arch.mcg_cap;
5451 banks[1] = mce->status;
5452 banks[2] = mce->addr;
5453 banks[3] = mce->misc;
5454 vcpu->arch.mcg_status = mce->mcg_status;
5457 !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5461 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5469 u64 mcg_cap = vcpu->arch.mcg_cap;
5471 u64 *banks = vcpu->arch.mce_banks;
5473 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5474 return -EINVAL;
5476 banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5485 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5486 vcpu->arch.mcg_ctl != ~(u64)0)
5492 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5494 if (mce->status & MCI_STATUS_UC) {
5495 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5501 mce->status |= MCI_STATUS_OVER;
5502 banks[2] = mce->addr;
5503 banks[3] = mce->misc;
5504 vcpu->arch.mcg_status = mce->mcg_status;
5505 banks[1] = mce->status;
5510 mce->status |= MCI_STATUS_OVER;
5511 banks[2] = mce->addr;
5512 banks[3] = mce->misc;
5513 banks[1] = mce->status;
5534 * non-exiting _injected_ exception, and a pending exiting exception.
5535 * In that case, ignore the VM-Exiting exception as it's an extension
5538 if (vcpu->arch.exception_vmexit.pending &&
5539 !vcpu->arch.exception.pending &&
5540 !vcpu->arch.exception.injected)
5541 ex = &vcpu->arch.exception_vmexit;
5543 ex = &vcpu->arch.exception;
5548 * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
5553 if (!vcpu->kvm->arch.exception_payload_enabled &&
5554 ex->pending && ex->has_payload)
5565 if (!kvm_exception_is_soft(ex->vector)) {
5566 events->exception.injected = ex->injected;
5567 events->exception.pending = ex->pending;
5573 if (!vcpu->kvm->arch.exception_payload_enabled)
5574 events->exception.injected |= ex->pending;
5576 events->exception.nr = ex->vector;
5577 events->exception.has_error_code = ex->has_error_code;
5578 events->exception.error_code = ex->error_code;
5579 events->exception_has_payload = ex->has_payload;
5580 events->exception_payload = ex->payload;
5582 events->interrupt.injected =
5583 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5584 events->interrupt.nr = vcpu->arch.interrupt.nr;
5585 events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
5587 events->nmi.injected = vcpu->arch.nmi_injected;
5588 events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
5589 events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
5591 /* events->sipi_vector is never valid when reporting to user space */
5594 events->smi.smm = is_smm(vcpu);
5595 events->smi.pending = vcpu->arch.smi_pending;
5596 events->smi.smm_inside_nmi =
5597 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5599 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5601 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5604 if (vcpu->kvm->arch.exception_payload_enabled)
5605 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5606 if (vcpu->kvm->arch.triple_fault_event) {
5607 events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5608 events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5615 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5621 return -EINVAL;
5623 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5624 if (!vcpu->kvm->arch.exception_payload_enabled)
5625 return -EINVAL;
5626 if (events->exception.pending)
5627 events->exception.injected = 0;
5629 events->exception_has_payload = 0;
5631 events->exception.pending = 0;
5632 events->exception_has_payload = 0;
5635 if ((events->exception.injected || events->exception.pending) &&
5636 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5637 return -EINVAL;
5643 * morph the exception to a VM-Exit if appropriate. Do this only for
5644 * pending exceptions, already-injected exceptions are not subject to
5647 * pending exception, which in turn may cause a spurious VM-Exit.
5649 vcpu->arch.exception_from_userspace = events->exception.pending;
5651 vcpu->arch.exception_vmexit.pending = false;
5653 vcpu->arch.exception.injected = events->exception.injected;
5654 vcpu->arch.exception.pending = events->exception.pending;
5655 vcpu->arch.exception.vector = events->exception.nr;
5656 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5657 vcpu->arch.exception.error_code = events->exception.error_code;
5658 vcpu->arch.exception.has_payload = events->exception_has_payload;
5659 vcpu->arch.exception.payload = events->exception_payload;
5661 vcpu->arch.interrupt.injected = events->interrupt.injected;
5662 vcpu->arch.interrupt.nr = events->interrupt.nr;
5663 vcpu->arch.interrupt.soft = events->interrupt.soft;
5664 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5666 events->interrupt.shadow);
5668 vcpu->arch.nmi_injected = events->nmi.injected;
5669 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
5670 vcpu->arch.nmi_pending = 0;
5671 atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5672 if (events->nmi.pending)
5675 kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
5677 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5679 vcpu->arch.apic->sipi_vector = events->sipi_vector;
5681 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5683 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5685 kvm_smm_changed(vcpu, events->smi.smm);
5688 vcpu->arch.smi_pending = events->smi.pending;
5690 if (events->smi.smm) {
5691 if (events->smi.smm_inside_nmi)
5692 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5694 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5698 if (events->smi.smm || events->smi.pending ||
5699 events->smi.smm_inside_nmi)
5700 return -EINVAL;
5704 if (events->smi.latched_init)
5705 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5707 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5711 if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5712 if (!vcpu->kvm->arch.triple_fault_event)
5713 return -EINVAL;
5714 if (events->triple_fault.pending)
5730 if (vcpu->kvm->arch.has_protected_state &&
5731 vcpu->arch.guest_state_protected)
5732 return -EINVAL;
5736 BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
5737 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5738 dbgregs->db[i] = vcpu->arch.db[i];
5740 dbgregs->dr6 = vcpu->arch.dr6;
5741 dbgregs->dr7 = vcpu->arch.dr7;
5750 if (vcpu->kvm->arch.has_protected_state &&
5751 vcpu->arch.guest_state_protected)
5752 return -EINVAL;
5754 if (dbgregs->flags)
5755 return -EINVAL;
5757 if (!kvm_dr6_valid(dbgregs->dr6))
5758 return -EINVAL;
5759 if (!kvm_dr7_valid(dbgregs->dr7))
5760 return -EINVAL;
5762 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5763 vcpu->arch.db[i] = dbgregs->db[i];
5766 vcpu->arch.dr6 = dbgregs->dr6;
5767 vcpu->arch.dr7 = dbgregs->dr7;
5789 u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
5792 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5793 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5795 fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
5796 supported_xcr0, vcpu->arch.pkru);
5803 return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
5804 sizeof(guest_xsave->region));
5810 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5811 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5813 return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5814 guest_xsave->region,
5816 &vcpu->arch.pkru);
5822 if (vcpu->kvm->arch.has_protected_state &&
5823 vcpu->arch.guest_state_protected)
5824 return -EINVAL;
5827 guest_xcrs->nr_xcrs = 0;
5831 guest_xcrs->nr_xcrs = 1;
5832 guest_xcrs->flags = 0;
5833 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5834 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5843 if (vcpu->kvm->arch.has_protected_state &&
5844 vcpu->arch.guest_state_protected)
5845 return -EINVAL;
5848 return -EINVAL;
5850 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5851 return -EINVAL;
5853 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5855 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5857 guest_xcrs->xcrs[i].value);
5861 r = -EINVAL;
5873 if (!vcpu->arch.pv_time.active)
5874 return -EINVAL;
5875 vcpu->arch.pvclock_set_guest_stopped_request = true;
5885 switch (attr->attr) {
5890 r = -ENXIO;
5899 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5902 switch (attr->attr) {
5904 r = -EFAULT;
5905 if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5910 r = -ENXIO;
5919 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5920 struct kvm *kvm = vcpu->kvm;
5923 switch (attr->attr) {
5929 r = -EFAULT;
5933 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5935 matched = (vcpu->arch.virtual_tsc_khz &&
5936 kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5937 kvm->arch.last_tsc_offset == offset);
5939 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5943 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5949 r = -ENXIO;
5963 return -EFAULT;
5966 return -ENXIO;
5986 if (cap->flags)
5987 return -EINVAL;
5989 switch (cap->cap) {
5992 if (cap->args[0])
5993 return -EINVAL;
5997 if (!irqchip_in_kernel(vcpu->kvm))
5998 return -EINVAL;
5999 return kvm_hv_activate_synic(vcpu, cap->cap ==
6007 if (!kvm_x86_ops.nested_ops->enable_evmcs)
6008 return -ENOTTY;
6009 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
6011 user_ptr = (void __user *)(uintptr_t)cap->args[0];
6014 r = -EFAULT;
6020 return -ENOTTY;
6025 return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
6029 vcpu->arch.pv_cpuid.enforce = cap->args[0];
6032 return -EINVAL;
6048 switch (reg->index) {
6051 * FIXME: If host-initiated accesses are ever exempted from
6057 return -EINVAL;
6059 reg->type = KVM_X86_REG_TYPE_MSR;
6060 reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
6063 return -EINVAL;
6073 return -EINVAL;
6076 return -EFAULT;
6086 return -EFAULT;
6089 return -EINVAL;
6104 return -EFAULT;
6107 return -EINVAL;
6110 if (reg->rsvd1 || reg->rsvd2)
6111 return -EINVAL;
6113 if (reg->type == KVM_X86_REG_TYPE_KVM) {
6119 if (reg->type != KVM_X86_REG_TYPE_MSR)
6120 return -EINVAL;
6123 return -EINVAL;
6125 guard(srcu)(&vcpu->kvm->srcu);
6127 load_fpu = is_xstate_managed_msr(vcpu, reg->index);
6133 r = kvm_get_one_msr(vcpu, reg->index, user_val);
6135 r = kvm_set_one_msr(vcpu, reg->index, user_val);
6148 if (get_user(user_nr_regs, &user_list->n))
6149 return -EFAULT;
6151 if (put_user(nr_regs, &user_list->n))
6152 return -EFAULT;
6155 return -E2BIG;
6158 put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
6159 return -EFAULT;
6167 struct kvm_vcpu *vcpu = filp->private_data;
6183 r = -EINVAL;
6188 r = -ENOMEM;
6194 r = -EFAULT;
6201 r = -EINVAL;
6216 r = -EFAULT;
6234 r = -EFAULT;
6237 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
6244 r = -EFAULT;
6248 cpuid_arg->entries);
6255 r = -EFAULT;
6259 cpuid_arg->entries);
6262 r = -EFAULT;
6269 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6271 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6275 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6277 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6290 r = -EFAULT;
6296 r = -EFAULT;
6306 r = -EINVAL;
6309 r = -EFAULT;
6312 idx = srcu_read_lock(&vcpu->kvm->srcu);
6314 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6320 r = -EFAULT;
6329 r = -EFAULT;
6340 r = -EFAULT;
6349 r = -EFAULT;
6365 r = -EFAULT;
6375 r = -EFAULT;
6384 r = -EINVAL;
6385 if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
6389 r = -ENOMEM;
6397 r = -EFAULT;
6404 int size = vcpu->arch.guest_fpu.uabi_size;
6417 int size = vcpu->arch.guest_fpu.uabi_size;
6420 r = -ENOMEM;
6428 r = -EFAULT;
6438 r = -ENOMEM;
6446 r = -EFAULT;
6466 r = -EINVAL;
6468 if (vcpu->arch.guest_tsc_protected)
6486 r = vcpu->arch.virtual_tsc_khz;
6496 r = -EFAULT;
6506 r = -EINVAL;
6507 if (!kvm_x86_ops.nested_ops->get_state)
6510 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
6511 r = -EFAULT;
6512 if (get_user(user_data_size, &user_kvm_nested_state->size))
6515 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
6521 if (put_user(r, &user_kvm_nested_state->size))
6522 r = -EFAULT;
6524 r = -E2BIG;
6536 r = -EINVAL;
6537 if (!kvm_x86_ops.nested_ops->set_state)
6540 r = -EFAULT;
6544 r = -EINVAL;
6559 idx = srcu_read_lock(&vcpu->kvm->srcu);
6560 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
6561 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6573 r = -EFAULT;
6578 r = -EFAULT;
6584 r = -EFAULT;
6592 r = -EINVAL;
6593 if (vcpu->kvm->arch.has_protected_state &&
6594 vcpu->arch.guest_state_protected)
6598 r = -ENOMEM;
6602 r = -EFAULT;
6609 r = -EINVAL;
6610 if (vcpu->kvm->arch.has_protected_state &&
6611 vcpu->arch.guest_state_protected)
6629 r = -ENOTTY;
6635 r = -EINVAL;
6653 if (addr > (unsigned int)(-3 * PAGE_SIZE))
6654 return -EINVAL;
6669 return -EINVAL;
6671 mutex_lock(&kvm->slots_lock);
6674 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6676 mutex_unlock(&kvm->slots_lock);
6686 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6687 * VM-Exit.
6692 if (!kvm->arch.cpu_dirty_log_size)
6704 if (cap->flags)
6705 return -EINVAL;
6707 switch (cap->cap) {
6709 r = -EINVAL;
6710 if (cap->args[0] & ~kvm_caps.supported_quirks)
6714 kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
6718 mutex_lock(&kvm->lock);
6719 r = -EINVAL;
6720 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6722 r = -EEXIST;
6725 if (kvm->created_vcpus)
6729 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6730 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6734 mutex_unlock(&kvm->lock);
6738 r = -EINVAL;
6739 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6742 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6743 kvm->arch.x2apic_format = true;
6744 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6745 kvm->arch.x2apic_broadcast_quirk_disabled = true;
6750 r = -EINVAL;
6751 if (cap->args[0] & ~kvm_get_allowed_disable_exits())
6754 mutex_lock(&kvm->lock);
6755 if (kvm->created_vcpus)
6758 #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6763 (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
6767 kvm_disable_exits(kvm, cap->args[0]);
6770 mutex_unlock(&kvm->lock);
6773 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6777 kvm->arch.exception_payload_enabled = cap->args[0];
6781 kvm->arch.triple_fault_event = cap->args[0];
6785 r = -EINVAL;
6786 if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6788 kvm->arch.user_space_msr_mask = cap->args[0];
6792 r = -EINVAL;
6793 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6796 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6797 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6801 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6802 kvm->arch.bus_lock_detection_enabled = true;
6809 r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6816 kvm->arch.sgx_provisioning_allowed = true;
6818 r = -EINVAL;
6823 r = -EINVAL;
6827 r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
6830 r = -EINVAL;
6834 r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
6837 if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6838 r = -EINVAL;
6841 kvm->arch.hypercall_exit_enabled = cap->args[0];
6845 r = -EINVAL;
6846 if (cap->args[0] & ~1)
6848 kvm->arch.exit_on_emulation_error = cap->args[0];
6852 r = -EINVAL;
6853 if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6856 mutex_lock(&kvm->lock);
6857 if (!kvm->created_vcpus) {
6858 kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6861 mutex_unlock(&kvm->lock);
6864 r = -EINVAL;
6865 if (cap->args[0] > KVM_MAX_VCPU_IDS)
6868 mutex_lock(&kvm->lock);
6869 if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
6871 } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6873 } else if (!kvm->arch.max_vcpu_ids) {
6874 kvm->arch.max_vcpu_ids = cap->args[0];
6877 mutex_unlock(&kvm->lock);
6880 r = -EINVAL;
6881 if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6885 if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6887 mutex_lock(&kvm->lock);
6888 if (!kvm->created_vcpus) {
6889 kvm->arch.notify_window = cap->args[0] >> 32;
6890 kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6893 mutex_unlock(&kvm->lock);
6896 r = -EINVAL;
6907 * this must use capable(), not ns_capable().
6910 r = -EPERM;
6914 if (cap->args[0])
6917 mutex_lock(&kvm->lock);
6918 if (!kvm->created_vcpus) {
6919 kvm->arch.disable_nx_huge_pages = true;
6922 mutex_unlock(&kvm->lock);
6925 u64 bus_cycle_ns = cap->args[0];
6932 r = -EINVAL;
6938 mutex_lock(&kvm->lock);
6940 r = -ENXIO;
6941 else if (kvm->created_vcpus)
6942 r = -EINVAL;
6944 kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
6945 mutex_unlock(&kvm->lock);
6949 r = -EINVAL;
6963 msr_filter->default_allow = default_allow;
6974 for (i = 0; i < msr_filter->count; i++)
6975 kfree(msr_filter->ranges[i].bitmap);
6986 if (!user_range->nmsrs)
6989 if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
6990 return -EINVAL;
6992 if (!user_range->flags)
6993 return -EINVAL;
6995 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
6997 return -EINVAL;
6999 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
7003 msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
7004 .flags = user_range->flags,
7005 .base = user_range->base,
7006 .nmsrs = user_range->nmsrs,
7010 msr_filter->count++;
7023 if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
7024 return -EINVAL;
7026 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
7027 empty &= !filter->ranges[i].nmsrs;
7029 default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
7031 return -EINVAL;
7035 return -ENOMEM;
7037 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
7038 r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
7045 mutex_lock(&kvm->lock);
7046 old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
7047 mutex_is_locked(&kvm->lock));
7048 mutex_unlock(&kvm->lock);
7049 synchronize_srcu(&kvm->srcu);
7082 struct kvm *kvm = filp->private_data;
7083 long r = -ENOTTY;
7094 return -EFAULT;
7102 .flags = cr->flags,
7103 .nmsrs = cr->nmsrs,
7104 .base = cr->base,
7105 .bitmap = (__u8 *)(ulong)cr->bitmap,
7152 return -EFAULT;
7159 struct kvm_arch *ka = &kvm->arch;
7164 return -EFAULT;
7171 return -EINVAL;
7179 * in use, we use master_kernel_ns + kvmclock_offset to set
7180 * unsigned 'system_time' so if we use get_kvmclock_ns() (which
7191 data.clock += now_real_ns - data.realtime;
7194 if (ka->use_master_clock)
7195 now_raw_ns = ka->master_kernel_ns;
7198 ka->kvmclock_offset = data.clock - now_raw_ns;
7206 struct kvm_vcpu *vcpu = filp->private_data;
7213 return -ENOIOCTLCMD;
7218 struct kvm *kvm = filp->private_data;
7220 int r = -ENOTTY;
7224 * This union makes it completely explicit to gcc-3.x
7242 mutex_lock(&kvm->lock);
7243 r = -EINVAL;
7244 if (kvm->created_vcpus)
7246 r = -EFAULT;
7251 mutex_unlock(&kvm->lock);
7259 mutex_lock(&kvm->lock);
7261 r = -EEXIST;
7266 * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
7268 * emulate level-triggered interrupts.
7270 r = -ENOTTY;
7271 if (kvm->arch.has_protected_eoi)
7274 r = -EINVAL;
7275 if (kvm->created_vcpus)
7294 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
7296 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
7299 mutex_unlock(&kvm->lock);
7306 r = -EFAULT;
7311 mutex_lock(&kvm->lock);
7312 r = -EEXIST;
7313 if (kvm->arch.vpit)
7315 r = -ENOENT;
7318 r = -ENOMEM;
7319 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7320 if (kvm->arch.vpit)
7323 mutex_unlock(&kvm->lock);
7335 r = -ENXIO;
7341 r = -EFAULT;
7359 r = -ENXIO;
7368 r = -EFAULT;
7371 r = -ENXIO;
7372 if (!kvm->arch.vpit)
7377 r = -EFAULT;
7384 r = -EFAULT;
7387 mutex_lock(&kvm->lock);
7388 r = -ENXIO;
7389 if (!kvm->arch.vpit)
7393 mutex_unlock(&kvm->lock);
7397 r = -ENXIO;
7398 if (!kvm->arch.vpit)
7403 r = -EFAULT;
7410 r = -EFAULT;
7413 mutex_lock(&kvm->lock);
7414 r = -ENXIO;
7415 if (!kvm->arch.vpit)
7419 mutex_unlock(&kvm->lock);
7424 r = -EFAULT;
7427 r = -ENXIO;
7428 if (!kvm->arch.vpit)
7436 mutex_lock(&kvm->lock);
7437 if (kvm->created_vcpus)
7438 r = -EBUSY;
7440 (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
7441 r = -EINVAL;
7443 kvm->arch.bsp_vcpu_id = arg;
7444 mutex_unlock(&kvm->lock);
7449 r = -EFAULT;
7458 r = -EFAULT;
7463 r = -EFAULT;
7469 r = -EFAULT;
7478 r = -EFAULT;
7494 r = -EINVAL;
7504 mutex_lock(&kvm->lock);
7505 if (!kvm->created_vcpus) {
7506 WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
7509 mutex_unlock(&kvm->lock);
7513 r = READ_ONCE(kvm->arch.default_tsc_khz);
7517 r = -ENOTTY;
7526 r = -EFAULT;
7530 r = -ENOTTY;
7540 r = -EFAULT;
7544 r = -ENOTTY;
7555 r = -EFAULT;
7570 return -EFAULT;
7576 r = -ENOTTY;
7636 (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7641 MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
7642 if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7647 MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
7648 if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7653 MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
7654 if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7742 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7747 len -= n;
7762 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7769 len -= n;
7791 struct kvm_mmu *mmu = vcpu->arch.mmu;
7796 /* NPT walks are always user-walks */
7798 t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7806 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7809 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7816 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7820 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7828 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7830 return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7837 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7842 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7843 unsigned offset = addr & (PAGE_SIZE-1);
7844 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7856 bytes -= toread;
7870 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7876 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7881 offset = addr & (PAGE_SIZE-1);
7883 bytes = (unsigned)PAGE_SIZE - offset;
7929 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7934 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7935 unsigned offset = addr & (PAGE_SIZE-1);
7936 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7947 bytes -= towrite;
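
/*
 * Illustrative sketch, not part of x86.c: the page-at-a-time copy pattern
 * used by the read/write helpers above, reduced to ordinary memory.  Each
 * iteration handles only up to the next page boundary, because every page
 * of the virtual range may translate to a different physical page.  The
 * identity-mapped translate() helper and the names are invented for the
 * example.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096u

static void *translate(uint8_t *backing, uint64_t addr)
{
	return backing + addr;	/* stands in for gva_to_gpa() */
}

static void copy_from_virt(uint8_t *backing, uint64_t addr, void *dst,
			   size_t bytes)
{
	uint8_t *out = dst;

	while (bytes) {
		size_t offset = addr & (SKETCH_PAGE_SIZE - 1);
		size_t chunk = SKETCH_PAGE_SIZE - offset;

		if (chunk > bytes)
			chunk = bytes;

		memcpy(out, translate(backing, addr), chunk);

		addr += chunk;
		out += chunk;
		bytes -= chunk;
	}
}
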
8035 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8045 !permission_fault(vcpu, vcpu->arch.walk_mmu,
8046 vcpu->arch.mmio_access, 0, access))) {
8047 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
8048 (gva & (PAGE_SIZE - 1));
8053 *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
8056 return -1;
8087 if (vcpu->mmio_read_completed) {
8089 vcpu->mmio_fragments[0].gpa, val);
8090 vcpu->mmio_read_completed = 0;
8125 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
8127 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
8153 bool write = ops->write;
8155 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8159 * If the GPA is present, use it to avoid the GVA to GPA table walk.
8164 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
8165 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
8166 gpa = ctxt->gpa_val;
8174 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
8180 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
8185 bytes -= handled;
8188 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
8189 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
8190 frag->gpa = gpa;
8191 frag->data = val;
8192 frag->len = bytes;
8206 if (ops->read_write_prepare &&
8207 ops->read_write_prepare(vcpu, val, bytes))
8210 vcpu->mmio_nr_fragments = 0;
8213 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
8216 now = -addr & ~PAGE_MASK;
8223 if (ctxt->mode != X86EMUL_MODE_PROT64)
8226 bytes -= now;
8234 if (!vcpu->mmio_nr_fragments)
8237 gpa = vcpu->mmio_fragments[0].gpa;
8239 vcpu->mmio_needed = 1;
8240 vcpu->mmio_cur_fragment = 0;
8242 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
8243 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
8244 vcpu->run->exit_reason = KVM_EXIT_MMIO;
8245 vcpu->run->mmio.phys_addr = gpa;
8247 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
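
/*
 * Illustrative sketch, not part of x86.c: how an access that straddles a
 * page boundary can be cut into per-page pieces, the way the emulator above
 * fills its mmio_fragments array before exiting to userspace.  The fragment
 * cap and all names are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096u
#define MAX_FRAGS 2

struct frag {
	uint64_t gpa;
	uint8_t *data;
	size_t len;
};

/* Returns how many fragments were written, at most MAX_FRAGS. */
static int split_into_fragments(uint64_t gpa, uint8_t *data, size_t bytes,
				struct frag *frags)
{
	int n = 0;

	while (bytes && n < MAX_FRAGS) {
		/*
		 * Bytes left in the current page; the code above computes the
		 * first piece as "-addr & ~PAGE_MASK" once it knows the
		 * access crosses a page boundary.
		 */
		size_t in_page = SKETCH_PAGE_SIZE - (gpa & (SKETCH_PAGE_SIZE - 1));
		size_t now = bytes < in_page ? bytes : in_page;

		frags[n].gpa = gpa;
		frags[n].data = data;
		frags[n].len = now;
		n++;

		gpa += now;
		data += now;
		bytes -= now;
	}
	return n;
}
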
8287 if (bytes > 8 || (bytes & (bytes - 1)))
8301 page_line_mask = ~(cache_line_size() - 1);
8305 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
8363 WARN_ON_ONCE(vcpu->arch.pio.count);
8379 memset(data, 0, size * (count - i));
8388 vcpu->arch.pio.port = port;
8389 vcpu->arch.pio.in = in;
8390 vcpu->arch.pio.count = count;
8391 vcpu->arch.pio.size = size;
8394 memset(vcpu->arch.pio_data, 0, size * count);
8396 memcpy(vcpu->arch.pio_data, data, size * count);
8398 vcpu->run->exit_reason = KVM_EXIT_IO;
8399 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
8400 vcpu->run->io.size = size;
8401 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
8402 vcpu->run->io.count = count;
8403 vcpu->run->io.port = port;
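
/*
 * Illustrative sketch, not part of x86.c: how a user-space VMM typically
 * consumes the KVM_EXIT_IO state populated above.  The data for all "count"
 * repetitions lives inside the shared run page at io.data_offset.  This is
 * only an outline, assuming <linux/kvm.h>; the device model behind the
 * comments is left out.
 */
#include <stdint.h>
#include <linux/kvm.h>

static void handle_io_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT) {
			/* forward run->io.size bytes at 'data' to run->io.port */
		} else {
			/* fill run->io.size bytes at 'data' from run->io.port */
		}
	}
	/* The VMM then calls KVM_RUN again and KVM completes the instruction. */
}
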
8419 int size = vcpu->arch.pio.size;
8420 unsigned int count = vcpu->arch.pio.count;
8421 memcpy(val, vcpu->arch.pio_data, size * count);
8422 trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
8423 vcpu->arch.pio.count = 0;
8431 if (vcpu->arch.pio.count) {
8479 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
8480 wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
8482 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8516 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
8529 value = vcpu->arch.cr2;
8558 vcpu->arch.cr2 = val;
8571 res = -1;
8632 desc->type = var.type;
8633 desc->s = var.s;
8634 desc->dpl = var.dpl;
8635 desc->p = var.present;
8636 desc->avl = var.avl;
8637 desc->l = var.l;
8638 desc->d = var.db;
8639 desc->g = var.g;
8657 if (desc->g)
8659 var.type = desc->type;
8660 var.dpl = desc->dpl;
8661 var.db = desc->d;
8662 var.s = desc->s;
8663 var.l = desc->l;
8664 var.g = desc->g;
8665 var.avl = desc->avl;
8666 var.present = desc->p;
8724 * Treat emulator accesses to the current shadow stack pointer as host-
8727 * so the index is fully KVM-controlled.
8748 emul_to_vcpu(ctxt)->arch.halt_request = 1;
8756 &ctxt->exception);
8823 *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
8834 struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8836 if (!kvm->vm_bugged)
8926 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8928 if (ctxt->exception.vector == PF_VECTOR)
8929 kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8930 else if (ctxt->exception.error_code_valid)
8931 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8932 ctxt->exception.error_code);
8934 kvm_queue_exception(vcpu, ctxt->exception.vector);
8947 ctxt->vcpu = vcpu;
8948 ctxt->ops = &emulate_ops;
8949 vcpu->arch.emulate_ctxt = ctxt;
8956 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8961 ctxt->gpa_available = false;
8962 ctxt->eflags = kvm_get_rflags(vcpu);
8963 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8965 ctxt->eip = kvm_rip_read(vcpu);
8966 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
8967 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
8971 ctxt->interruptibility = 0;
8972 ctxt->have_exception = false;
8973 ctxt->exception.vector = -1;
8974 ctxt->perm_ok = false;
8977 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8982 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8987 ctxt->op_bytes = 2;
8988 ctxt->ad_bytes = 2;
8989 ctxt->_eip = ctxt->eip + inc_eip;
8995 ctxt->eip = ctxt->_eip;
8996 kvm_rip_write(vcpu, ctxt->eip);
8997 kvm_set_rflags(vcpu, ctxt->eflags);
9005 struct kvm_run *run = vcpu->run;
9018 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9019 run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
9031 run->emulation_failure.flags = 0;
9034 BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
9035 sizeof(run->emulation_failure.insn_bytes) != 16));
9037 run->emulation_failure.flags |=
9039 run->emulation_failure.insn_size = insn_size;
9040 memset(run->emulation_failure.insn_bytes, 0x90,
9041 sizeof(run->emulation_failure.insn_bytes));
9042 memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
9045 memcpy(&run->internal.data[info_start], info, sizeof(info));
9046 memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
9049 run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
9054 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9056 prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
9057 ctxt->fetch.end - ctxt->fetch.data);
9076 struct kvm_run *run = vcpu->run;
9083 run->internal.data[ndata++] = info2;
9084 run->internal.data[ndata++] = reason;
9085 run->internal.data[ndata++] = info1;
9086 run->internal.data[ndata++] = gpa;
9087 run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
9089 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9090 run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
9091 run->internal.ndata = ndata;
9099 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9100 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
9101 vcpu->run->internal.ndata = 2;
9102 vcpu->run->internal.data[0] = exit_reason;
9103 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
9109 struct kvm *kvm = vcpu->kvm;
9111 ++vcpu->stat.insn_emulation_fail;
9119 if (kvm->arch.exit_on_emulation_error ||
9148 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
9157 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
9158 * guest to let the CPU re-execute the instruction in the hope that the
9193 struct kvm_run *kvm_run = vcpu->run;
9195 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
9196 kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
9197 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
9198 kvm_run->debug.arch.exception = DB_VECTOR;
9199 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9269 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
9270 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
9271 struct kvm_run *kvm_run = vcpu->run;
9274 vcpu->arch.guest_debug_dr7,
9275 vcpu->arch.eff_db);
9278 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
9279 kvm_run->debug.arch.pc = eip;
9280 kvm_run->debug.arch.exception = DB_VECTOR;
9281 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9287 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
9291 vcpu->arch.dr7,
9292 vcpu->arch.db);
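
/*
 * Illustrative sketch, not part of x86.c: the DR7 matching that the
 * breakpoint checks above rely on, reduced to plain integers.  For
 * breakpoint i, DR7 bits 2*i and 2*i+1 are the local/global enables and
 * bits 16+4*i..17+4*i select the R/W type (0 means instruction fetch).
 * The helper is invented for the example and models exact-address matches
 * only, ignoring the length field.
 */
#include <stdint.h>

/* Returns a DR6-style bitmask of the breakpoints that hit. */
static uint32_t hw_bp_hits(uint64_t addr, uint32_t type, uint32_t dr7,
			   const uint64_t db[4])
{
	uint32_t hits = 0;

	for (int i = 0; i < 4; i++) {
		uint32_t enable = (dr7 >> (i * 2)) & 0x3;	/* L/G enables */
		uint32_t rw = (dr7 >> (16 + i * 4)) & 0x3;	/* access type */

		if (enable && rw == type && db[i] == addr)
			hits |= 1u << i;
	}
	return hits;
}
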
9306 switch (ctxt->opcode_len) {
9308 switch (ctxt->b) {
9325 switch (ctxt->b) {
9340 switch (ctxt->b) {
9344 return vector == ctxt->src.val;
9355 * (and wrong) when emulating on an intercepted fault-like exception[*], as
9365 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9373 ++vcpu->stat.insn_emulation;
9383 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9416 * are fault-like and are higher priority than any faults on
9434 if (ctxt->have_exception &&
9437 * #UD should result in just EMULATION_FAILED, and trap-like
9440 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
9441 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
9457 * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
9459 * injecting single-step #DBs.
9466 if (ctxt->mode != X86EMUL_MODE_PROT64)
9467 ctxt->eip = (u32)ctxt->_eip;
9469 ctxt->eip = ctxt->_eip;
9476 kvm_rip_write(vcpu, ctxt->eip);
9477 if (ctxt->eflags & X86_EFLAGS_RF)
9478 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9483 * If emulation was caused by a write-protection #PF on a non-page_table
9495 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
9496 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9503 ctxt->exception.address = cr2_or_gpa;
9506 if (vcpu->arch.mmu->root_role.direct) {
9507 ctxt->gpa_available = true;
9508 ctxt->gpa_val = cr2_or_gpa;
9512 ctxt->exception.address = 0;
9517 * L2, unless KVM is re-emulating a previously decoded instruction,
9535 if (ctxt->have_exception) {
9536 WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
9537 vcpu->mmio_needed = false;
9540 } else if (vcpu->arch.pio.count) {
9541 if (!vcpu->arch.pio.in) {
9542 /* FIXME: return into emulator if single-stepping. */
9543 vcpu->arch.pio.count = 0;
9546 vcpu->arch.complete_userspace_io = complete_emulated_pio;
9549 } else if (vcpu->mmio_needed) {
9550 ++vcpu->stat.mmio_exits;
9552 if (!vcpu->mmio_is_write)
9555 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9556 } else if (vcpu->arch.complete_userspace_io) {
9567 toggle_interruptibility(vcpu, ctxt->interruptibility);
9568 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9571 * Note, EXCPT_DB is assumed to be fault-like as the emulator
9573 * of which are fault-like.
9575 if (!ctxt->have_exception ||
9576 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
9578 if (ctxt->is_branch)
9580 kvm_rip_write(vcpu, ctxt->eip);
9581 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
9584 __kvm_set_rflags(vcpu, ctxt->eflags);
9593 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
9596 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
9616 vcpu->arch.pio.count = 0;
9622 vcpu->arch.pio.count = 0;
9624 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
9644 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9645 vcpu->arch.complete_userspace_io =
9649 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9650 vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9660 BUG_ON(vcpu->arch.pio.count != 1);
9662 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
9663 vcpu->arch.pio.count = 0;
9668 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
9691 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9692 vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9723 khz = freq->new;
9744 /* TSC frequency always matches when on Hyper-V */
9812 if (vcpu->cpu != cpu)
9815 if (vcpu->cpu != raw_smp_processor_id())
9821 if (freq->old < freq->new && send_ipi) {
9844 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9846 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9849 for_each_cpu(cpu, freq->policy->cpus)
9877 if (policy->cpuinfo.max_freq)
9878 max_tsc_khz = policy->cpuinfo.max_freq;
9932 * Disable master clock if host does not trust, or does not use,
9936 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9949 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9959 #include <asm/kvm-x86-ops.h>
9962 kvm_pmu_ops_update(ops->pmu_ops);
9980 return -EIO;
9999 return -EEXIST;
10009 return -EOPNOTSUPP;
10014 return -EOPNOTSUPP;
10027 return -EIO;
10039 return -EIO;
10047 return -ENOMEM;
10072 kvm_init_pmu_capability(ops->pmu_ops);
10079 r = ops->hardware_setup();
10101 if (pi_inject_timer == -1)
10110 kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
10203 return -KVM_EOPNOTSUPP;
10206 * When tsc is in permanent catchup mode guests won't be able to use
10209 if (vcpu->arch.tsc_always_catchup)
10210 return -KVM_EOPNOTSUPP;
10213 return -KVM_EOPNOTSUPP;
10222 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
10224 ret = -KVM_EFAULT;
10233 * @apicid - apicid of vcpu to be kicked.
10253 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
10259 ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
10287 set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
10289 init_rwsem(&kvm->arch.apicv_update_lock);
10297 vcpu->stat.directed_yield_attempted++;
10303 map = rcu_dereference(vcpu->kvm->arch.apic_map);
10305 if (likely(map) && dest_id <= map->max_apic_id) {
10306 dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
10307 if (map->phys_map[dest_id])
10308 target = map->phys_map[dest_id]->vcpu;
10313 if (!target || !READ_ONCE(target->ready))
10323 vcpu->stat.directed_yield_successful++;
10331 u64 ret = vcpu->run->hypercall.ret;
10350 ++vcpu->stat.hypercalls;
10363 ret = -KVM_EPERM;
10367 ret = -KVM_ENOSYS;
10377 kvm_pv_kick_cpu_op(vcpu->kvm, a1);
10390 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10402 ret = -KVM_ENOSYS;
10403 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
10408 ret = -KVM_EINVAL;
10412 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
10413 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
10415 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
10416 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
10418 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
10420 vcpu->run->hypercall.ret = 0;
10421 vcpu->run->hypercall.args[0] = gpa;
10422 vcpu->run->hypercall.args[1] = npages;
10423 vcpu->run->hypercall.args[2] = attrs;
10424 vcpu->run->hypercall.flags = 0;
10426 vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
10428 WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
10429 vcpu->arch.complete_userspace_io = complete_hypercall;
10433 ret = -KVM_ENOSYS;
10438 vcpu->run->hypercall.ret = ret;
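
/*
 * Illustrative sketch, not part of x86.c: the user-space side of the
 * KVM_EXIT_HYPERCALL path set up above for KVM_HC_MAP_GPA_RANGE.  A VMM is
 * expected to act on nr/args, store its result in run->hypercall.ret and
 * call KVM_RUN again, at which point the completion callback installed
 * above propagates the value back to the guest.  This assumes the UAPI
 * headers below; map_gpa_range() is invented for the example.
 */
#include <stdint.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>

static uint64_t map_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
{
	/* Update the VMM's shared/private view of [gpa, gpa + npages); elided. */
	(void)gpa; (void)npages; (void)attrs;
	return 0;
}

static void handle_hypercall_exit(struct kvm_run *run)
{
	if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
		run->hypercall.ret = map_gpa_range(run->hypercall.args[0],
						   run->hypercall.args[1],
						   run->hypercall.args[2]);
}
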
10445 if (kvm_xen_hypercall_enabled(vcpu->kvm))
10466 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10467 ctxt->exception.error_code_valid = false;
10468 ctxt->exception.vector = UD_VECTOR;
10469 ctxt->have_exception = true;
10476 &ctxt->exception);
10481 return vcpu->run->request_interrupt_window &&
10482 likely(!pic_in_kernel(vcpu->kvm));
10485 /* Called within kvm->srcu read side. */
10488 struct kvm_run *kvm_run = vcpu->run;
10490 kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
10491 kvm_run->cr8 = kvm_get_cr8(vcpu);
10492 kvm_run->apic_base = vcpu->arch.apic_base;
10494 kvm_run->ready_for_interrupt_injection =
10495 pic_in_kernel(vcpu->kvm) ||
10499 kvm_run->flags |= KVM_RUN_X86_SMM;
10501 kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
10514 if (vcpu->arch.apic->apicv_active)
10517 if (!vcpu->arch.apic->vapic_addr)
10520 max_irr = -1;
10522 if (max_irr != -1)
10534 kvm_x86_ops.nested_ops->triple_fault(vcpu);
10538 return kvm_x86_ops.nested_ops->check_events(vcpu);
10547 * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
10550 vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
10552 trace_kvm_inj_exception(vcpu->arch.exception.vector,
10553 vcpu->arch.exception.has_error_code,
10554 vcpu->arch.exception.error_code,
10555 vcpu->arch.exception.injected);
10565 * injected as part of a previous VM-Enter, but weren't successfully delivered
10566 * and need to be re-injected.
10571 * also be able to re-inject NMIs and IRQs in the middle of an instruction.
10572 * I.e. for exceptions and re-injected events, NOT invoking this on instruction
10577 * instruction boundaries for asynchronous events. However, because VM-Exits
10583 * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
10606 * Process nested events first, as nested VM-Exit supersedes event
10607 * re-injection. If there's an event queued for re-injection, it will
10608 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
10616 * Re-inject exceptions and events *especially* if immediate entry+exit
10620 * Don't re-inject an NMI or interrupt if there is a pending exception.
10629 * as the exception "occurred" before the exit to userspace. Trap-like
10631 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
10634 * Thus a pending fault-like exception means the fault occurred on the
10638 if (vcpu->arch.exception.injected)
10642 else if (vcpu->arch.nmi_injected)
10644 else if (vcpu->arch.interrupt.injected)
10648 * Exceptions that morph to VM-Exits are handled above, and pending
10649 * exceptions on top of injected exceptions that do not VM-Exit should
10652 WARN_ON_ONCE(vcpu->arch.exception.injected &&
10653 vcpu->arch.exception.pending);
10657 * nested VM-Enter or event re-injection so that a different pending
10660 * Otherwise, continue processing events even if VM-Exit occurred. The
10661 * VM-Exit will have cleared exceptions that were meant for L2, but
10668 * A pending exception VM-Exit should either result in nested VM-Exit
10669 * or force an immediate re-entry and exit to/from L2, and exception
10670 * VM-Exits cannot be injected (flag should _never_ be set).
10672 WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10673 vcpu->arch.exception_vmexit.pending);
10677 * to re-inject a previous event. See above comments on re-injecting
10682 if (vcpu->arch.exception.pending) {
10684 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10685 * value pushed on the stack. Trap-like exception and all #DBs
10686 * leave RF as-is (KVM follows Intel's behavior in this regard;
10691 * fault-like. They do _not_ set RF, a la code breakpoints.
10693 if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
10697 if (vcpu->arch.exception.vector == DB_VECTOR) {
10698 kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
10699 if (vcpu->arch.dr7 & DR7_GD) {
10700 vcpu->arch.dr7 &= ~DR7_GD;
10707 vcpu->arch.exception.pending = false;
10708 vcpu->arch.exception.injected = true;
10714 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10719 * due to architectural conditions (e.g. IF=0) a window-open exit
10720 * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
10726 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10729 if (vcpu->arch.smi_pending) {
10731 -EBUSY;
10735 vcpu->arch.smi_pending = false;
10736 ++vcpu->arch.smi_count;
10744 if (vcpu->arch.nmi_pending) {
10746 -EBUSY;
10750 --vcpu->arch.nmi_pending;
10751 vcpu->arch.nmi_injected = true;
10756 if (vcpu->arch.nmi_pending)
10762 -EBUSY;
10768 if (!WARN_ON_ONCE(irq == -1)) {
10779 kvm_x86_ops.nested_ops->has_events &&
10780 kvm_x86_ops.nested_ops->has_events(vcpu, true))
10785 * is done emulating and should only propagate the to-be-injected event
10787 * infinite loop as KVM will bail from VM-Enter to inject the pending
10795 WARN_ON_ONCE(vcpu->arch.exception.pending ||
10796 vcpu->arch.exception_vmexit.pending);
10800 if (r == -EBUSY) {
10821 if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10828 * tracked in vcpu->arch.nmi_pending.
10831 limit--;
10833 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10834 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10836 if (vcpu->arch.nmi_pending &&
10838 vcpu->arch.nmi_pending--;
10840 if (vcpu->arch.nmi_pending)
10847 return vcpu->arch.nmi_pending +
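
/*
 * Illustrative sketch, not part of x86.c: the NMI collapsing rule applied
 * above, without the atomics.  Real hardware can hold one NMI in service
 * plus one latched, so at most two survive; if one is already masked or
 * mid-injection, only one more may stay pending.  The helper name is
 * invented for the example.
 */
static unsigned int collapse_nmis(unsigned int pending, unsigned int queued,
				  int nmi_masked_or_injected)
{
	unsigned int limit = nmi_masked_or_injected ? 1 : 2;

	pending += queued;	/* drain the producer-side counter */
	return pending < limit ? pending : limit;
}
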
10864 struct kvm_lapic *apic = vcpu->arch.apic;
10870 down_read(&vcpu->kvm->arch.apicv_update_lock);
10877 if (apic->apicv_active == activate)
10880 apic->apicv_active = activate;
10890 if (!apic->apicv_active)
10895 up_read(&vcpu->kvm->arch.apicv_update_lock);
10909 * this case so that KVM can use the AVIC doorbell to inject interrupts
10915 if (apic_x2apic_mode(vcpu->arch.apic) &&
10927 lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10932 old = new = kvm->arch.apicv_inhibit_reasons;
10950 kvm->arch.apicv_inhibit_reasons = new;
10953 int idx = srcu_read_lock(&kvm->srcu);
10956 srcu_read_unlock(&kvm->srcu, idx);
10959 kvm->arch.apicv_inhibit_reasons = new;
10969 down_write(&kvm->arch.apicv_update_lock);
10971 up_write(&kvm->arch.apicv_update_lock);
10980 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10981 vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
10985 if (irqchip_split(vcpu->kvm))
10986 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10988 else if (ioapic_in_kernel(vcpu->kvm))
10989 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10993 vcpu->arch.load_eoi_exitmap_pending = true;
11000 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
11008 vcpu->arch.ioapic_handled_vectors,
11009 to_hv_synic(vcpu)->vec_bitmap, 256);
11015 vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
11032 * Called within kvm->srcu read side.
11050 r = -EIO;
11060 if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
11070 kvm_update_masterclock(vcpu->kvm);
11094 * Fall back to a "full" guest flush if Hyper-V's precise
11095 * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
11106 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
11112 kvm_x86_ops.nested_ops->triple_fault(vcpu);
11115 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
11116 vcpu->mmio_needed = 0;
11123 vcpu->arch.apf.halted = true;
11140 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
11141 if (test_bit(vcpu->arch.pending_ioapic_eoi,
11142 vcpu->arch.ioapic_handled_vectors)) {
11143 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
11144 vcpu->run->eoi.vector =
11145 vcpu->arch.pending_ioapic_eoi;
11158 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11159 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
11160 vcpu->run->system_event.ndata = 0;
11165 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11166 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
11167 vcpu->run->system_event.ndata = 0;
11174 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
11175 vcpu->run->hyperv = hv_vcpu->exit;
11182 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
11183 * depend on the guest clock being up-to-date
11201 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
11210 ++vcpu->stat.req_event;
11216 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
11251 /* Store vcpu->apicv_active before vcpu->mode. */
11252 smp_store_release(&vcpu->mode, IN_GUEST_MODE);
11257 * 1) We should set ->mode before checking ->requests. Please see
11260 * 2) For APICv, we should set ->mode before checking PID.ON. This
11281 vcpu->mode = OUTSIDE_GUEST_MODE;
11300 if (vcpu->arch.guest_fpu.xfd_err)
11301 wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
11305 if (unlikely(vcpu->arch.switch_db_regs &&
11306 !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
11308 set_debugreg(vcpu->arch.eff_db[0], 0);
11309 set_debugreg(vcpu->arch.eff_db[1], 1);
11310 set_debugreg(vcpu->arch.eff_db[2], 2);
11311 set_debugreg(vcpu->arch.eff_db[3], 3);
11313 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
11322 * vendor code if any host-owned bits were changed, e.g. so that the
11326 if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
11327 !vcpu->arch.guest_state_protected)
11329 vcpu->arch.host_debugctl = debug_ctl;
11335 * of flows where non-KVM code can run with guest state loaded.
11343 * per-VM state, and responding vCPUs must wait for the update
11363 /* Note, VM-Exits that go down the "slow" path are accounted below. */
11364 ++vcpu->stat.exits;
11375 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
11376 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
11377 WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
11393 vcpu->arch.last_vmentry_cpu = vcpu->cpu;
11394 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
11396 vcpu->mode = OUTSIDE_GUEST_MODE;
11403 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
11406 if (vcpu->arch.xfd_no_write_intercept)
11411 if (vcpu->arch.guest_fpu.xfd_err)
11425 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
11432 ++vcpu->stat.exits;
11441 * acceptable for all known use cases.
11460 !vcpu->arch.guest_state_protected)) {
11465 if (unlikely(vcpu->arch.tsc_always_catchup))
11468 if (vcpu->arch.apic_attention)
11481 if (unlikely(vcpu->arch.apic_attention))
11489 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
11490 !vcpu->arch.apf.halted);
11495 if (!list_empty_careful(&vcpu->async_pf.done))
11506 (vcpu->arch.nmi_pending &&
11512 (vcpu->arch.smi_pending &&
11530 kvm_x86_ops.nested_ops->has_events &&
11531 kvm_x86_ops.nested_ops->has_events(vcpu, false))
11543 return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
11547 /* Called within kvm->srcu read side. */
11554 * Switch to the software timer before halt-polling/blocking as
11557 * Switch before halt-polling so that KVM recognizes an expired
11565 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11586 * state field (AMD does not have a similar field and a VM-Exit always
11592 WARN_ON_ONCE(r == -EBUSY);
11599 switch (vcpu->arch.mp_state) {
11605 vcpu->arch.apf.halted = false;
11616 /* Called within kvm->srcu read side. */
11621 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
11627 * use a stale page translation. Assume that any code after
11630 vcpu->arch.at_instruction_boundary = false;
11650 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
11651 ++vcpu->stat.request_irq_exits;
11671 * local APIC is in-kernel, the run loop will detect the non-runnable
11676 ++vcpu->stat.halt_exits;
11678 if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
11683 vcpu->run->exit_reason = reason;
11698 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
11734 return vcpu->arch.preempted_in_kernel;
11739 if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
11759 BUG_ON(!vcpu->arch.pio.count);
11784 struct kvm_run *run = vcpu->run;
11788 BUG_ON(!vcpu->mmio_needed);
11791 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
11792 len = min(8u, frag->len);
11793 if (!vcpu->mmio_is_write)
11794 memcpy(frag->data, run->mmio.data, len);
11796 if (frag->len <= 8) {
11799 vcpu->mmio_cur_fragment++;
11802 frag->data += len;
11803 frag->gpa += len;
11804 frag->len -= len;
11807 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
11808 vcpu->mmio_needed = 0;
11810 /* FIXME: return into emulator if single-stepping. */
11811 if (vcpu->mmio_is_write)
11813 vcpu->mmio_read_completed = 1;
11817 run->exit_reason = KVM_EXIT_MMIO;
11818 run->mmio.phys_addr = frag->gpa;
11819 if (vcpu->mmio_is_write)
11820 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
11821 run->mmio.len = min(8u, frag->len);
11822 run->mmio.is_write = vcpu->mmio_is_write;
11823 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
11830 if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
11833 /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
11834 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
11841 if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
11844 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
11845 ++vcpu->stat.fpu_reload;
11852 * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
11857 if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
11858 return -EINVAL;
11864 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
11866 return -EINVAL;
11873 struct kvm_queued_exception *ex = &vcpu->arch.exception;
11874 struct kvm_run *kvm_run = vcpu->run;
11878 r = kvm_mmu_post_init_vm(vcpu->kvm);
11884 kvm_run->flags = 0;
11888 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
11889 if (!vcpu->wants_to_run) {
11890 r = -EINTR;
11912 r = -EAGAIN;
11914 r = -EINTR;
11915 kvm_run->exit_reason = KVM_EXIT_INTR;
11916 ++vcpu->stat.signal_exits;
11921 sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
11922 if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
11923 (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
11924 r = -EINVAL;
11928 if (kvm_run->kvm_dirty_regs) {
11934 /* re-sync apic's tpr */
11936 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11937 r = -EINVAL;
11944 * a pending VM-Exit if L1 wants to intercept the exception.
11946 if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11947 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11948 ex->error_code)) {
11949 kvm_queue_exception_vmexit(vcpu, ex->vector,
11950 ex->has_error_code, ex->error_code,
11951 ex->has_payload, ex->payload);
11952 ex->injected = false;
11953 ex->pending = false;
11955 vcpu->arch.exception_from_userspace = false;
11957 if (unlikely(vcpu->arch.complete_userspace_io)) {
11958 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11959 vcpu->arch.complete_userspace_io = NULL;
11964 WARN_ON_ONCE(vcpu->arch.pio.count);
11965 WARN_ON_ONCE(vcpu->mmio_needed);
11968 if (!vcpu->wants_to_run) {
11969 r = -EINTR;
11981 if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
11993 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
12001 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
12002 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12004 regs->rax = kvm_rax_read(vcpu);
12005 regs->rbx = kvm_rbx_read(vcpu);
12006 regs->rcx = kvm_rcx_read(vcpu);
12007 regs->rdx = kvm_rdx_read(vcpu);
12008 regs->rsi = kvm_rsi_read(vcpu);
12009 regs->rdi = kvm_rdi_read(vcpu);
12010 regs->rsp = kvm_rsp_read(vcpu);
12011 regs->rbp = kvm_rbp_read(vcpu);
12013 regs->r8 = kvm_r8_read(vcpu);
12014 regs->r9 = kvm_r9_read(vcpu);
12015 regs->r10 = kvm_r10_read(vcpu);
12016 regs->r11 = kvm_r11_read(vcpu);
12017 regs->r12 = kvm_r12_read(vcpu);
12018 regs->r13 = kvm_r13_read(vcpu);
12019 regs->r14 = kvm_r14_read(vcpu);
12020 regs->r15 = kvm_r15_read(vcpu);
12023 regs->rip = kvm_rip_read(vcpu);
12024 regs->rflags = kvm_get_rflags(vcpu);
12029 if (vcpu->kvm->arch.has_protected_state &&
12030 vcpu->arch.guest_state_protected)
12031 return -EINVAL;
12041 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
12042 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12044 kvm_rax_write(vcpu, regs->rax);
12045 kvm_rbx_write(vcpu, regs->rbx);
12046 kvm_rcx_write(vcpu, regs->rcx);
12047 kvm_rdx_write(vcpu, regs->rdx);
12048 kvm_rsi_write(vcpu, regs->rsi);
12049 kvm_rdi_write(vcpu, regs->rdi);
12050 kvm_rsp_write(vcpu, regs->rsp);
12051 kvm_rbp_write(vcpu, regs->rbp);
12053 kvm_r8_write(vcpu, regs->r8);
12054 kvm_r9_write(vcpu, regs->r9);
12055 kvm_r10_write(vcpu, regs->r10);
12056 kvm_r11_write(vcpu, regs->r11);
12057 kvm_r12_write(vcpu, regs->r12);
12058 kvm_r13_write(vcpu, regs->r13);
12059 kvm_r14_write(vcpu, regs->r14);
12060 kvm_r15_write(vcpu, regs->r15);
12063 kvm_rip_write(vcpu, regs->rip);
12064 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
12066 vcpu->arch.exception.pending = false;
12067 vcpu->arch.exception_vmexit.pending = false;
12074 if (vcpu->kvm->arch.has_protected_state &&
12075 vcpu->arch.guest_state_protected)
12076 return -EINVAL;
12088 if (vcpu->arch.guest_state_protected)
12091 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12092 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12093 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12094 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12095 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12096 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12098 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12099 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12102 sregs->idt.limit = dt.size;
12103 sregs->idt.base = dt.address;
12105 sregs->gdt.limit = dt.size;
12106 sregs->gdt.base = dt.address;
12108 sregs->cr2 = vcpu->arch.cr2;
12109 sregs->cr3 = kvm_read_cr3(vcpu);
12112 sregs->cr0 = kvm_read_cr0(vcpu);
12113 sregs->cr4 = kvm_read_cr4(vcpu);
12114 sregs->cr8 = kvm_get_cr8(vcpu);
12115 sregs->efer = vcpu->arch.efer;
12116 sregs->apic_base = vcpu->arch.apic_base;
12123 if (vcpu->arch.guest_state_protected)
12126 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
12127 set_bit(vcpu->arch.interrupt.nr,
12128 (unsigned long *)sregs->interrupt_bitmap);
12137 if (vcpu->arch.guest_state_protected)
12142 sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
12143 sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
12150 if (vcpu->kvm->arch.has_protected_state &&
12151 vcpu->arch.guest_state_protected)
12152 return -EINVAL;
12173 if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
12174 vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
12175 vcpu->arch.pv.pv_unhalted)
12176 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
12178 mp_state->mp_state = vcpu->arch.mp_state;
12189 int ret = -EINVAL;
12193 switch (mp_state->mp_state) {
12212 * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
12215 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
12216 mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
12217 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
12220 kvm_set_mp_state(vcpu, mp_state->mp_state);
12232 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
12239 * Check both User and Supervisor on task switches as inter-
12263 if (ret || vcpu->mmio_needed)
12266 kvm_rip_write(vcpu, ctxt->eip);
12267 kvm_set_rflags(vcpu, ctxt->eflags);
12271 vcpu->mmio_needed = false;
12272 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
12273 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
12274 vcpu->run->internal.ndata = 0;
12281 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
12284 * 64-bit mode (though maybe in a 32-bit code segment).
12287 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
12289 if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
12293 * Not in 64-bit mode: EFER.LMA is clear and the code
12294 * segment cannot be 64-bit.
12296 if (sregs->efer & EFER_LMA || sregs->cs.l)
12300 return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
12301 kvm_is_valid_cr0(vcpu, sregs->cr0);
12311 return -EINVAL;
12313 if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
12314 return -EINVAL;
12316 if (vcpu->arch.guest_state_protected)
12319 dt.size = sregs->idt.limit;
12320 dt.address = sregs->idt.base;
12322 dt.size = sregs->gdt.limit;
12323 dt.address = sregs->gdt.base;
12326 vcpu->arch.cr2 = sregs->cr2;
12327 *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
12328 vcpu->arch.cr3 = sregs->cr3;
12330 kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
12332 kvm_set_cr8(vcpu, sregs->cr8);
12334 *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
12335 kvm_x86_call(set_efer)(vcpu, sregs->efer);
12337 *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
12338 kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
12340 *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
12341 kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
12344 idx = srcu_read_lock(&vcpu->kvm->srcu);
12349 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12352 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12353 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12354 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12355 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12356 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12357 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12359 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12360 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12366 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
12389 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
12402 bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
12403 bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
12404 !(sregs2->efer & EFER_LMA);
12407 if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
12408 return -EINVAL;
12410 if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
12411 return -EINVAL;
12420 kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
12424 vcpu->arch.pdptrs_from_userspace = true;
12438 if (vcpu->kvm->arch.has_protected_state &&
12439 vcpu->arch.guest_state_protected)
12440 return -EINVAL;
12457 down_write(&kvm->arch.apicv_update_lock);
12460 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
12466 up_write(&kvm->arch.apicv_update_lock);
12475 if (vcpu->arch.guest_state_protected)
12476 return -EINVAL;
12480 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
12481 r = -EBUSY;
12484 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
12496 vcpu->guest_debug = dbg->control;
12497 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
12498 vcpu->guest_debug = 0;
12500 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
12502 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
12503 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
12506 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
12510 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
12511 vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
12521 kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
12536 unsigned long vaddr = tr->linear_address;
12542 idx = srcu_read_lock(&vcpu->kvm->srcu);
12544 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12545 tr->physical_address = gpa;
12546 tr->valid = gpa != INVALID_GPA;
12547 tr->writeable = 1;
12548 tr->usermode = 0;
12558 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12559 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12563 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12564 memcpy(fpu->fpr, fxsave->st_space, 128);
12565 fpu->fcw = fxsave->cwd;
12566 fpu->fsw = fxsave->swd;
12567 fpu->ftwx = fxsave->twd;
12568 fpu->last_opcode = fxsave->fop;
12569 fpu->last_ip = fxsave->rip;
12570 fpu->last_dp = fxsave->rdp;
12571 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
12581 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12582 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12586 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12588 memcpy(fxsave->st_space, fpu->fpr, 128);
12589 fxsave->cwd = fpu->fcw;
12590 fxsave->swd = fpu->fsw;
12591 fxsave->twd = fpu->ftwx;
12592 fxsave->fop = fpu->last_opcode;
12593 fxsave->rip = fpu->last_ip;
12594 fxsave->rdp = fpu->last_dp;
12595 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
12605 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
12606 __get_regs(vcpu, &vcpu->run->s.regs.regs);
12608 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
12609 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12611 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
12613 vcpu, &vcpu->run->s.regs.events);
12618 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
12619 __set_regs(vcpu, &vcpu->run->s.regs.regs);
12620 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
12623 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
12624 struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
12627 return -EINVAL;
12629 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
12632 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
12633 struct kvm_vcpu_events events = vcpu->run->s.regs.events;
12636 return -EINVAL;
12638 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
12646 if (kvm_check_tsc_unstable() && kvm->created_vcpus)
12650 if (!kvm->arch.max_vcpu_ids)
12651 kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
12653 if (id >= kvm->arch.max_vcpu_ids)
12654 return -EINVAL;
12664 vcpu->arch.last_vmentry_cpu = -1;
12665 vcpu->arch.regs_avail = ~0;
12666 vcpu->arch.regs_dirty = ~0;
12668 kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
12670 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12683 r = -ENOMEM;
12688 vcpu->arch.pio_data = page_address(page);
12690 vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
12692 vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
12694 if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
12696 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
12698 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12705 if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
12712 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
12713 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
12714 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
12715 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
12719 vcpu->arch.pending_external_vector = -1;
12720 vcpu->arch.preempted_in_kernel = false;
12723 vcpu->arch.hv_root_tdp = INVALID_PAGE;
12733 kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
12740 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12742 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12744 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12746 kfree(vcpu->arch.mce_banks);
12747 kfree(vcpu->arch.mci_ctl2_banks);
12748 free_page((unsigned long)vcpu->arch.pio_data);
12758 if (mutex_lock_killable(&vcpu->mutex))
12765 vcpu->arch.msr_kvm_poll_control = 1;
12767 mutex_unlock(&vcpu->mutex);
12784 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12785 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12786 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12791 kfree(vcpu->arch.mce_banks);
12792 kfree(vcpu->arch.mci_ctl2_banks);
12794 idx = srcu_read_lock(&vcpu->kvm->srcu);
12796 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12797 free_page((unsigned long)vcpu->arch.pio_data);
12798 kvfree(vcpu->arch.cpuid_entries);
12803 struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
12834 * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
12838 WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
12839 fpu_in_use = fpstate->in_use;
12855 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
12858 * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
12865 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
12876 vcpu->arch.hflags = 0;
12878 vcpu->arch.smi_pending = 0;
12879 vcpu->arch.smi_count = 0;
12880 atomic_set(&vcpu->arch.nmi_queued, 0);
12881 vcpu->arch.nmi_pending = 0;
12882 vcpu->arch.nmi_injected = false;
12886 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
12888 vcpu->arch.dr6 = DR6_ACTIVE_LOW;
12889 vcpu->arch.dr7 = DR7_FIXED_1;
12892 vcpu->arch.cr2 = 0;
12895 vcpu->arch.apf.msr_en_val = 0;
12896 vcpu->arch.apf.msr_int_val = 0;
12897 vcpu->arch.st.msr_val = 0;
12903 vcpu->arch.apf.halted = false;
12908 vcpu->arch.smbase = 0x30000;
12910 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
12912 vcpu->arch.msr_misc_features_enables = 0;
12913 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
12921 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
12932 kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12939 vcpu->arch.cr3 = 0;
12961 * which PCIDs have to be flushed. However, CR0.WP and the paging-related
13031 if (!stable && vcpu->cpu == smp_processor_id())
13033 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
13035 if (vcpu->arch.last_host_tsc > max_tsc)
13036 max_tsc = vcpu->arch.last_host_tsc;
13066 * N.B. - this code below runs only on platforms with reliable TSC,
13080 u64 delta_cyc = max_tsc - local_tsc;
13082 kvm->arch.backwards_tsc_observed = true;
13084 vcpu->arch.tsc_offset_adjustment += delta_cyc;
13085 vcpu->arch.last_host_tsc = local_tsc;
13095 kvm->arch.last_tsc_nsec = 0;
13096 kvm->arch.last_tsc_write = 0;
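
/*
 * Illustrative sketch, not part of x86.c: the compensation applied above
 * when the host TSC appears to go backwards across suspend/resume.  The
 * guest observes (host TSC + offset), so raising every vCPU's offset by the
 * largest gap seen keeps all guest TSC values monotonic.  The structure and
 * names are invented for the example.
 */
#include <stdint.h>

struct sketch_vcpu {
	uint64_t last_host_tsc;	/* host TSC recorded when the vCPU last ran */
	int64_t tsc_offset;	/* what the guest adds to the host TSC */
};

static void compensate_backwards_tsc(struct sketch_vcpu *vcpus, int n,
				     uint64_t host_tsc_now)
{
	uint64_t max_seen = 0;

	for (int i = 0; i < n; i++)
		if (vcpus[i].last_host_tsc > max_seen)
			max_seen = vcpus[i].last_host_tsc;

	if (max_seen <= host_tsc_now)
		return;		/* nothing went backwards */

	for (int i = 0; i < n; i++) {
		vcpus[i].tsc_offset += (int64_t)(max_seen - host_tsc_now);
		vcpus[i].last_host_tsc = host_tsc_now;
	}
}
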
13108 * Leave the user-return notifiers as-is when disabling virtualization
13110 * pin kvm.ko (if it's a module) to defend against use-after-free (in
13113 * could be actively modifying user-return MSR state when the IPI to
13125 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
13131 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
13137 kfree(kvm->arch.hv_pa_pg);
13149 return -EINVAL;
13151 kvm->arch.vm_type = type;
13152 kvm->arch.has_private_mem =
13155 kvm->arch.pre_fault_allowed =
13157 kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
13171 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
13173 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
13174 mutex_init(&kvm->arch.apic_map_lock);
13175 seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
13176 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
13178 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
13180 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
13182 kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
13183 kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
13184 kvm->arch.guest_can_read_msr_platform_info = true;
13185 kvm->arch.enable_pmu = enable_pmu;
13188 spin_lock_init(&kvm->arch.hv_root_tdp_lock);
13189 kvm->arch.hv_root_tdp = INVALID_PAGE;
13203 once_init(&kvm->arch.nx_once);
13227 * -errno: on error
13229 * The caller should always use IS_ERR() to check the return value
13230 * before use. Note, the KVM internal memory slots are guaranteed to
13232 * GPA->HVA translation will not change. However, the HVA is a user
13244 lockdep_assert_held(&kvm->slots_lock);
13247 return ERR_PTR_USR(-EINVAL);
13251 if (slot && slot->npages)
13252 return ERR_PTR_USR(-EEXIST);
13263 if (!slot || !slot->npages)
13266 old_npages = slot->npages;
13267 hva = slot->userspace_addr;
13295 * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
13308 if (current->mm == kvm->mm) {
13314 mutex_lock(&kvm->slots_lock);
13320 mutex_unlock(&kvm->slots_lock);
13323 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
13328 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
13329 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13342 vfree(slot->arch.rmap[i]);
13343 slot->arch.rmap[i] = NULL;
13354 vfree(slot->arch.lpage_info[i - 1]);
13355 slot->arch.lpage_info[i - 1] = NULL;
13363 const int sz = sizeof(*slot->arch.rmap[0]);
13370 if (slot->arch.rmap[i])
13373 slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
13374 if (!slot->arch.rmap[i]) {
13376 return -ENOMEM;
13386 unsigned long npages = slot->npages;
13394 memset(&slot->arch, 0, sizeof(slot->arch));
13414 slot->arch.lpage_info[i - 1] = linfo;
13416 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
13418 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
13419 linfo[lpages - 1].disallow_lpage = 1;
13420 ugfn = slot->userspace_addr >> PAGE_SHIFT;
13425 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
13446 vfree(slot->arch.lpage_info[i - 1]);
13447 slot->arch.lpage_info[i - 1] = NULL;
13449 return -ENOMEM;
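
/*
 * Illustrative sketch, not part of x86.c: the alignment rules behind the
 * disallow_lpage marking above, on plain integers.  A slot whose start or
 * end is not aligned to the huge-page size cannot map its first or last
 * huge page as huge, and a mismatch between guest-physical and host-virtual
 * alignment rules huge pages out for the whole slot.  All names here are
 * invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>

/* pages_per_hpage is e.g. 512 for 2M huge pages over 4K base pages. */
static void mark_disallowed_lpages(uint64_t base_gfn, uint64_t npages,
				   uint64_t ugfn, uint64_t pages_per_hpage,
				   bool *head_disallowed, bool *tail_disallowed,
				   bool *all_disallowed)
{
	uint64_t mask = pages_per_hpage - 1;

	*head_disallowed = (base_gfn & mask) != 0;
	*tail_disallowed = ((base_gfn + npages) & mask) != 0;
	/* Guest-physical and host-virtual offsets within a huge page must agree. */
	*all_disallowed = ((base_gfn ^ ugfn) & mask) != 0;
}
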
13458 * memslots->generation has been incremented.
13463 /* Force re-initialization of steal_time cache */
13475 * trackers attached to the VM, i.e. if KVMGT is in use.
13478 return -EINVAL;
13481 if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
13482 return -EINVAL;
13484 if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
13485 return -EINVAL;
13491 memcpy(&new->arch, &old->arch, sizeof(old->arch));
13493 return -EIO;
13503 if (!kvm->arch.cpu_dirty_log_size)
13506 nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
13516 u32 old_flags = old ? old->flags : 0;
13517 u32 new_flags = new ? new->flags : 0;
13537 * CREATE: No shadow pages exist, thus nothing to write-protect
13546 * READONLY and non-flags changes were filtered out above, and the only
13566 * Initially-all-set does not require write protecting any page,
13575 if (kvm->arch.cpu_dirty_log_size) {
13591 * write-protected before returning to userspace, i.e. before
13598 * Specifically, KVM also write-protects guest page tables to
13607 * To handle these scenarios, KVM uses a separate software-only
13608 * bit (MMU-writable) to track if a SPTE is !writable due to
13609 * a guest page table being write-protected (KVM clears the
13610 * MMU-writable flag when write-protecting for shadow paging).
13612 * The use of MMU-writable is also the primary motivation for
13615 * !MMU-writable SPTE, KVM must flush if it encounters any
13616 * MMU-writable SPTE regardless of whether the actual hardware
13619 * write access" helpers to ignore MMU-writable entirely.
13622 * access-tracked SPTEs is particularly relevant).
13636 if (!kvm->arch.n_requested_mmu_pages &&
13640 nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
13656 if (vcpu->arch.guest_state_protected)
13666 if (vcpu->arch.guest_state_protected)
13685 if (vcpu->arch.guest_state_protected)
13706 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
13714 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
13715 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
13736 return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
13743 while (vcpu->arch.apf.gfns[key] != ~0)
13746 vcpu->arch.apf.gfns[key] = gfn;
13755 (vcpu->arch.apf.gfns[key] != gfn &&
13756 vcpu->arch.apf.gfns[key] != ~0); i++)
13764 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
13773 if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
13777 vcpu->arch.apf.gfns[i] = ~0;
13780 if (vcpu->arch.apf.gfns[j] == ~0)
13782 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
13789 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
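
/*
 * Illustrative sketch, not part of x86.c: the open-addressing table behind
 * the async-#PF gfn tracking above, shrunk to a standalone example.  ~0
 * marks an empty slot, collisions probe linearly, and deletion pulls later
 * chain members back so lookups never cross a hole.  The table size and the
 * hash are invented for the example; the sketch assumes the table is sized
 * so it never fills.
 */
#include <stdbool.h>
#include <stdint.h>

#define TABLE_SIZE 64u			/* must be a power of two */

static uint64_t table[TABLE_SIZE];

static void table_init(void)
{
	for (unsigned int i = 0; i < TABLE_SIZE; i++)
		table[i] = ~0ull;
}

static unsigned int hash_gfn(uint64_t gfn)
{
	/* Stand-in for hash_32(); any reasonable mix works here. */
	return (unsigned int)((gfn * 0x9E3779B97F4A7C15ull) >> 58);
}

static unsigned int next_probe(unsigned int key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void table_add(uint64_t gfn)
{
	unsigned int key = hash_gfn(gfn);

	while (table[key] != ~0ull)
		key = next_probe(key);
	table[key] = gfn;
}

static unsigned int table_slot(uint64_t gfn)
{
	unsigned int key = hash_gfn(gfn);

	while (table[key] != gfn && table[key] != ~0ull)
		key = next_probe(key);
	return key;
}

static bool table_contains(uint64_t gfn)
{
	return table[table_slot(gfn)] == gfn;
}

static void table_del(uint64_t gfn)
{
	unsigned int i, j, k;

	i = j = table_slot(gfn);
	if (table[i] != gfn)
		return;

	for (;;) {
		table[i] = ~0ull;
		do {
			j = next_probe(j);
			if (table[j] == ~0ull)
				return;
			k = hash_gfn(table[j]);
			/* keep walking while k lies cyclically in (i, j] */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* table[j] could live at or before i, so it fills the hole */
		table[i] = table[j];
		i = j;
	}
}
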
13798 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
13806 return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13815 if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13828 if (!vcpu->arch.apf.send_always &&
13829 (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
13837 return vcpu->arch.apf.delivery_as_pf_vmexit;
13855 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
13859 * If interrupts are off we cannot even use an artificial
13870 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
13871 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
13879 fault.address = work->arch.token;
13902 .vector = vcpu->arch.apf.vec
13905 if (work->wakeup_all)
13906 work->arch.token = ~0; /* broadcast wakeup */
13908 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
13909 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
13911 if ((work->wakeup_all || work->notpresent_injected) &&
13913 !apf_put_user_ready(vcpu, work->arch.token)) {
13914 WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
13918 vcpu->arch.apf.halted = false;
13929 if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
13944 * Non-coherent DMA assignment and de-assignment may affect whether or
13947 * (or last) non-coherent device is (un)registered to so that new SPTEs
13958 if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
13964 if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
13970 return atomic_read(&kvm->arch.noncoherent_dma_count);
13976 return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
14033 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
14039 mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
14041 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
14052 vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
14065 if (KVM_BUG_ON(!e, vcpu->kvm))
14066 return -EIO;
14076 * doesn't seem to be a real use-case behind such requests, just return
14132 * page tables, so a non-global flush just degenerates to a
14151 struct kvm_run *run = vcpu->run;
14155 BUG_ON(!vcpu->mmio_needed);
14158 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
14159 len = min(8u, frag->len);
14160 if (!vcpu->mmio_is_write)
14161 memcpy(frag->data, run->mmio.data, len);
14163 if (frag->len <= 8) {
14166 vcpu->mmio_cur_fragment++;
14169 frag->data += len;
14170 frag->gpa += len;
14171 frag->len -= len;
14174 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
14175 vcpu->mmio_needed = 0;
14183 run->mmio.phys_addr = frag->gpa;
14184 run->mmio.len = min(8u, frag->len);
14185 run->mmio.is_write = vcpu->mmio_is_write;
14186 if (run->mmio.is_write)
14187 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
14188 run->exit_reason = KVM_EXIT_MMIO;
14190 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14202 return -EINVAL;
14208 bytes -= handled;
14213 frag = vcpu->mmio_fragments;
14214 vcpu->mmio_nr_fragments = 1;
14215 frag->len = bytes;
14216 frag->gpa = gpa;
14217 frag->data = data;
14219 vcpu->mmio_needed = 1;
14220 vcpu->mmio_cur_fragment = 0;
14222 vcpu->run->mmio.phys_addr = gpa;
14223 vcpu->run->mmio.len = min(8u, frag->len);
14224 vcpu->run->mmio.is_write = 1;
14225 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
14226 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14228 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14241 return -EINVAL;
14247 bytes -= handled;
14252 frag = vcpu->mmio_fragments;
14253 vcpu->mmio_nr_fragments = 1;
14254 frag->len = bytes;
14255 frag->gpa = gpa;
14256 frag->data = data;
14258 vcpu->mmio_needed = 1;
14259 vcpu->mmio_cur_fragment = 0;
14261 vcpu->run->mmio.phys_addr = gpa;
14262 vcpu->run->mmio.len = min(8u, frag->len);
14263 vcpu->run->mmio.is_write = 0;
14264 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14266 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14274 vcpu->arch.sev_pio_count -= count;
14275 vcpu->arch.sev_pio_data += count * size;
14283 int size = vcpu->arch.pio.size;
14284 int port = vcpu->arch.pio.port;
14286 vcpu->arch.pio.count = 0;
14287 if (vcpu->arch.sev_pio_count)
14297 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14298 int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
14306 if (!vcpu->arch.sev_pio_count)
14310 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
14319 unsigned count = vcpu->arch.pio.count;
14320 int size = vcpu->arch.pio.size;
14321 int port = vcpu->arch.pio.port;
14323 complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
14325 if (vcpu->arch.sev_pio_count)
14335 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14336 if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
14341 if (!vcpu->arch.sev_pio_count)
14345 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
14353 vcpu->arch.sev_pio_data = data;
14354 vcpu->arch.sev_pio_count = count;