xref: /linux/arch/x86/kvm/lapic.c (revision 9fd2da71c301184d98fe37674ca8d017d1ce6600)
1 // SPDX-License-Identifier: GPL-2.0-only
2 
3 /*
4  * Local APIC virtualization
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2007 Novell
8  * Copyright (C) 2007 Intel
9  * Copyright 2009 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Dor Laor <dor.laor@qumranet.com>
13  *   Gregory Haskins <ghaskins@novell.com>
14  *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
15  *
16  * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
17  */
18 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19 
20 #include <linux/kvm_host.h>
21 #include <linux/kvm.h>
22 #include <linux/mm.h>
23 #include <linux/highmem.h>
24 #include <linux/smp.h>
25 #include <linux/hrtimer.h>
26 #include <linux/io.h>
27 #include <linux/export.h>
28 #include <linux/math64.h>
29 #include <linux/slab.h>
30 #include <asm/apic.h>
31 #include <asm/processor.h>
32 #include <asm/mce.h>
33 #include <asm/msr.h>
34 #include <asm/page.h>
35 #include <asm/current.h>
36 #include <asm/apicdef.h>
37 #include <asm/delay.h>
38 #include <linux/atomic.h>
39 #include <linux/jump_label.h>
40 #include "kvm_cache_regs.h"
41 #include "irq.h"
42 #include "ioapic.h"
43 #include "trace.h"
44 #include "x86.h"
45 #include "xen.h"
46 #include "cpuid.h"
47 #include "hyperv.h"
48 #include "smm.h"
49 
50 #ifndef CONFIG_X86_64
51 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
52 #else
53 #define mod_64(x, y) ((x) % (y))
54 #endif
55 
56 /* 14 is the version for Xeon and Pentium 8.4.8*/
57 #define APIC_VERSION			0x14UL
58 #define LAPIC_MMIO_LENGTH		(1 << 12)
59 
60 /*
61  * Enable local APIC timer advancement (tscdeadline mode only) with adaptive
62  * tuning.  When enabled, KVM programs the host timer event to fire early, i.e.
63  * before the deadline expires, to account for the delay between taking the
64  * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume
65  * the guest, i.e. so that the interrupt arrives in the guest with minimal
66  * latency relative to the deadline programmed by the guest.
67  */
68 static bool lapic_timer_advance __read_mostly = true;
69 module_param(lapic_timer_advance, bool, 0444);
70 
71 #define LAPIC_TIMER_ADVANCE_ADJUST_MIN	100	/* clock cycles */
72 #define LAPIC_TIMER_ADVANCE_ADJUST_MAX	10000	/* clock cycles */
73 #define LAPIC_TIMER_ADVANCE_NS_INIT	1000
74 #define LAPIC_TIMER_ADVANCE_NS_MAX     5000
75 /* step-by-step approximation to mitigate fluctuation */
76 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
77 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
78 static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
79 
80 static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
81 {
82 	apic_set_reg(apic->regs, reg_off, val);
83 }
84 
85 static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
86 {
87 	return apic_get_reg64(apic->regs, reg);
88 }
89 
90 static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
91 						int reg, u64 val)
92 {
93 	apic_set_reg64(apic->regs, reg, val);
94 }
95 
96 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
97 {
98 	struct kvm_lapic *apic = vcpu->arch.apic;
99 
100 	return apic_test_vector(vector, apic->regs + APIC_ISR) ||
101 		apic_test_vector(vector, apic->regs + APIC_IRR);
102 }
103 
/*
 * Static key, initially false, exported to other modules.
 * NOTE(review): by its name it tracks whether any vCPU lacks an in-kernel
 * local APIC; the code that flips it is outside this view -- confirm.
 */
104 __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
105 EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
106 
/*
 * Deferred static keys, initially false, declared with HZ as the second
 * macro argument.  apic_sw_disabled is incremented by apic_set_spiv() when an
 * APIC is software-disabled and decremented (deferred) when it is
 * software-enabled again.
 * NOTE(review): apic_hw_disabled is presumably the hardware-enable
 * counterpart; its users are outside this view -- confirm.
 */
107 __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
108 __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
109 
/*
 * Return 1 when the APIC is both software- and hardware-enabled, else 0.
 * The software-enable state is checked first.
 */
static inline int apic_enabled(struct kvm_lapic *apic)
{
	if (!kvm_apic_sw_enabled(apic))
		return 0;

	return !!kvm_apic_hw_enabled(apic);
}
114 
/* Bits common to the LVT entries: vector, send-pending status and mask. */
#define LVT_MASK	\
	(APIC_VECTOR_MASK | APIC_SEND_PENDING | APIC_LVT_MASKED)

/* LVT_MASK plus delivery mode, polarity, remote IRR and trigger mode. */
#define LINT_MASK	\
	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
	 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_REMOTE_IRR)
121 
122 static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
123 {
124 	return apic->vcpu->vcpu_id;
125 }
126 
127 static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
128 {
129 	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
130 		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
131 }
132 
133 bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
134 {
135 	return kvm_x86_ops.set_hv_timer
136 	       && !(kvm_mwait_in_guest(vcpu->kvm) ||
137 		    kvm_can_post_timer_interrupt(vcpu));
138 }
139 
140 static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
141 {
142 	return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
143 }
144 
145 static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
146 {
147 	return ((id >> 4) << 16) | (1 << (id & 0xf));
148 }
149 
150 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
151 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
152 	switch (map->logical_mode) {
153 	case KVM_APIC_MODE_SW_DISABLED:
154 		/* Arbitrarily use the flat map so that @cluster isn't NULL. */
155 		*cluster = map->xapic_flat_map;
156 		*mask = 0;
157 		return true;
158 	case KVM_APIC_MODE_X2APIC: {
159 		u32 offset = (dest_id >> 16) * 16;
160 		u32 max_apic_id = map->max_apic_id;
161 
162 		if (offset <= max_apic_id) {
163 			u8 cluster_size = min(max_apic_id - offset + 1, 16U);
164 
165 			offset = array_index_nospec(offset, map->max_apic_id + 1);
166 			*cluster = &map->phys_map[offset];
167 			*mask = dest_id & (0xffff >> (16 - cluster_size));
168 		} else {
169 			*mask = 0;
170 		}
171 
172 		return true;
173 		}
174 	case KVM_APIC_MODE_XAPIC_FLAT:
175 		*cluster = map->xapic_flat_map;
176 		*mask = dest_id & 0xff;
177 		return true;
178 	case KVM_APIC_MODE_XAPIC_CLUSTER:
179 		*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
180 		*mask = dest_id & 0xf;
181 		return true;
182 	case KVM_APIC_MODE_MAP_DISABLED:
183 		return false;
184 	default:
185 		WARN_ON_ONCE(1);
186 		return false;
187 	}
188 }
189 
190 static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
191 				    struct kvm_vcpu *vcpu,
192 				    bool *xapic_id_mismatch)
193 {
194 	struct kvm_lapic *apic = vcpu->arch.apic;
195 	u32 x2apic_id = kvm_x2apic_id(apic);
196 	u32 xapic_id = kvm_xapic_id(apic);
197 	u32 physical_id;
198 
199 	/*
200 	 * For simplicity, KVM always allocates enough space for all possible
201 	 * xAPIC IDs.  Yell, but don't kill the VM, as KVM can continue on
202 	 * without the optimized map.
203 	 */
204 	if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
205 		return -EINVAL;
206 
207 	/*
208 	 * Bail if a vCPU was added and/or enabled its APIC between allocating
209 	 * the map and doing the actual calculations for the map.  Note, KVM
210 	 * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
211 	 * the compiler decides to reload x2apic_id after this check.
212 	 */
213 	if (x2apic_id > new->max_apic_id)
214 		return -E2BIG;
215 
216 	/*
217 	 * Deliberately truncate the vCPU ID when detecting a mismatched APIC
218 	 * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
219 	 * 32-bit value.  Any unwanted aliasing due to truncation results will
220 	 * be detected below.
221 	 */
222 	if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
223 		*xapic_id_mismatch = true;
224 
225 	/*
226 	 * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
227 	 * Allow sending events to vCPUs by their x2APIC ID even if the target
228 	 * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
229 	 * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
230 	 * and collide).
231 	 *
232 	 * Honor the architectural (and KVM's non-optimized) behavior if
233 	 * userspace has not enabled 32-bit x2APIC IDs.  Each APIC is supposed
234 	 * to process messages independently.  If multiple vCPUs have the same
235 	 * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
236 	 * manually modified its xAPIC IDs, events targeting that ID are
237 	 * supposed to be recognized by all vCPUs with said ID.
238 	 */
239 	if (vcpu->kvm->arch.x2apic_format) {
240 		/* See also kvm_apic_match_physical_addr(). */
241 		if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
242 			new->phys_map[x2apic_id] = apic;
243 
244 		if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
245 			new->phys_map[xapic_id] = apic;
246 	} else {
247 		/*
248 		 * Disable the optimized map if the physical APIC ID is already
249 		 * mapped, i.e. is aliased to multiple vCPUs.  The optimized
250 		 * map requires a strict 1:1 mapping between IDs and vCPUs.
251 		 */
252 		if (apic_x2apic_mode(apic))
253 			physical_id = x2apic_id;
254 		else
255 			physical_id = xapic_id;
256 
257 		if (new->phys_map[physical_id])
258 			return -EINVAL;
259 
260 		new->phys_map[physical_id] = apic;
261 	}
262 
263 	return 0;
264 }
265 
266 static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
267 					struct kvm_vcpu *vcpu)
268 {
269 	struct kvm_lapic *apic = vcpu->arch.apic;
270 	enum kvm_apic_logical_mode logical_mode;
271 	struct kvm_lapic **cluster;
272 	u16 mask;
273 	u32 ldr;
274 
275 	if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
276 		return;
277 
278 	if (!kvm_apic_sw_enabled(apic))
279 		return;
280 
281 	ldr = kvm_lapic_get_reg(apic, APIC_LDR);
282 	if (!ldr)
283 		return;
284 
285 	if (apic_x2apic_mode(apic)) {
286 		logical_mode = KVM_APIC_MODE_X2APIC;
287 	} else {
288 		ldr = GET_APIC_LOGICAL_ID(ldr);
289 		if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
290 			logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
291 		else
292 			logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
293 	}
294 
295 	/*
296 	 * To optimize logical mode delivery, all software-enabled APICs must
297 	 * be configured for the same mode.
298 	 */
299 	if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
300 		new->logical_mode = logical_mode;
301 	} else if (new->logical_mode != logical_mode) {
302 		new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
303 		return;
304 	}
305 
306 	/*
307 	 * In x2APIC mode, the LDR is read-only and derived directly from the
308 	 * x2APIC ID, thus is guaranteed to be addressable.  KVM reuses
309 	 * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
310 	 * reversing the LDR calculation to get cluster of APICs, i.e. no
311 	 * additional work is required.
312 	 */
313 	if (apic_x2apic_mode(apic))
314 		return;
315 
316 	if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
317 							&cluster, &mask))) {
318 		new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
319 		return;
320 	}
321 
322 	if (!mask)
323 		return;
324 
325 	ldr = ffs(mask) - 1;
326 	if (!is_power_of_2(mask) || cluster[ldr])
327 		new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
328 	else
329 		cluster[ldr] = apic;
330 }
331 
/*
 * States of kvm->arch.apic_map_dirty.
 *
 * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY transitions happen without
 * a lock.
 *
 * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN transitions
 * happen with apic_map_lock held.
 */
enum {
	CLEAN			= 0,
	UPDATE_IN_PROGRESS	= 1,
	DIRTY			= 2,
};
343 
344 static void kvm_recalculate_apic_map(struct kvm *kvm)
345 {
346 	struct kvm_apic_map *new, *old = NULL;
347 	struct kvm_vcpu *vcpu;
348 	unsigned long i;
349 	u32 max_id = 255; /* enough space for any xAPIC ID */
350 	bool xapic_id_mismatch;
351 	int r;
352 
353 	/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map.  */
354 	if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
355 		return;
356 
357 	WARN_ONCE(!irqchip_in_kernel(kvm),
358 		  "Dirty APIC map without an in-kernel local APIC");
359 
360 	mutex_lock(&kvm->arch.apic_map_lock);
361 
362 retry:
363 	/*
364 	 * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
365 	 * or the APIC registers (if dirty).  Note, on retry the map may have
366 	 * not yet been marked dirty by whatever task changed a vCPU's x2APIC
367 	 * ID, i.e. the map may still show up as in-progress.  In that case
368 	 * this task still needs to retry and complete its calculation.
369 	 */
370 	if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
371 				   DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
372 		/* Someone else has updated the map. */
373 		mutex_unlock(&kvm->arch.apic_map_lock);
374 		return;
375 	}
376 
377 	/*
378 	 * Reset the mismatch flag between attempts so that KVM does the right
379 	 * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
380 	 * keep max_id strictly increasing.  Disallowing max_id from shrinking
381 	 * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
382 	 * with the highest x2APIC ID is toggling its APIC on and off.
383 	 */
384 	xapic_id_mismatch = false;
385 
386 	kvm_for_each_vcpu(i, vcpu, kvm)
387 		if (kvm_apic_present(vcpu))
388 			max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
389 
390 	new = kvzalloc(sizeof(struct kvm_apic_map) +
391 	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
392 			   GFP_KERNEL_ACCOUNT);
393 
394 	if (!new)
395 		goto out;
396 
397 	new->max_apic_id = max_id;
398 	new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
399 
400 	kvm_for_each_vcpu(i, vcpu, kvm) {
401 		if (!kvm_apic_present(vcpu))
402 			continue;
403 
404 		r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
405 		if (r) {
406 			kvfree(new);
407 			new = NULL;
408 			if (r == -E2BIG) {
409 				cond_resched();
410 				goto retry;
411 			}
412 
413 			goto out;
414 		}
415 
416 		kvm_recalculate_logical_map(new, vcpu);
417 	}
418 out:
419 	/*
420 	 * The optimized map is effectively KVM's internal version of APICv,
421 	 * and all unwanted aliasing that results in disabling the optimized
422 	 * map also applies to APICv.
423 	 */
424 	if (!new)
425 		kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
426 	else
427 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
428 
429 	if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
430 		kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
431 	else
432 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
433 
434 	if (xapic_id_mismatch)
435 		kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
436 	else
437 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
438 
439 	old = rcu_dereference_protected(kvm->arch.apic_map,
440 			lockdep_is_held(&kvm->arch.apic_map_lock));
441 	rcu_assign_pointer(kvm->arch.apic_map, new);
442 	/*
443 	 * Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
444 	 * If another update has come in, leave it DIRTY.
445 	 */
446 	atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
447 			       UPDATE_IN_PROGRESS, CLEAN);
448 	mutex_unlock(&kvm->arch.apic_map_lock);
449 
450 	if (old)
451 		kvfree_rcu(old, rcu);
452 
453 	kvm_make_scan_ioapic_request(kvm);
454 }
455 
456 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
457 {
458 	bool enabled = val & APIC_SPIV_APIC_ENABLED;
459 
460 	kvm_lapic_set_reg(apic, APIC_SPIV, val);
461 
462 	if (enabled != apic->sw_enabled) {
463 		apic->sw_enabled = enabled;
464 		if (enabled)
465 			static_branch_slow_dec_deferred(&apic_sw_disabled);
466 		else
467 			static_branch_inc(&apic_sw_disabled.key);
468 
469 		atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
470 	}
471 
472 	/* Check if there are APF page ready requests pending */
473 	if (enabled) {
474 		kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
475 		kvm_xen_sw_enable_lapic(apic->vcpu);
476 	}
477 }
478 
479 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
480 {
481 	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
482 	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
483 }
484 
485 static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
486 {
487 	kvm_lapic_set_reg(apic, APIC_LDR, id);
488 	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
489 }
490 
491 static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
492 {
493 	kvm_lapic_set_reg(apic, APIC_DFR, val);
494 	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
495 }
496 
497 static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
498 {
499 	u32 ldr = kvm_apic_calc_x2apic_ldr(id);
500 
501 	WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
502 
503 	kvm_lapic_set_reg(apic, APIC_ID, id);
504 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
505 	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
506 }
507 
508 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
509 {
510 	return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
511 }
512 
513 static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
514 {
515 	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
516 }
517 
518 static inline int apic_lvtt_period(struct kvm_lapic *apic)
519 {
520 	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
521 }
522 
523 static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
524 {
525 	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
526 }
527 
528 static inline int apic_lvt_nmi_mode(u32 lvt_val)
529 {
530 	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
531 }
532 
533 static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
534 {
535 	return apic->nr_lvt_entries > lvt_index;
536 }
537 
538 static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
539 {
540 	return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
541 }
542 
543 void kvm_apic_set_version(struct kvm_vcpu *vcpu)
544 {
545 	struct kvm_lapic *apic = vcpu->arch.apic;
546 	u32 v = 0;
547 
548 	if (!lapic_in_kernel(vcpu))
549 		return;
550 
551 	v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
552 
553 	/*
554 	 * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
555 	 * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
556 	 * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
557 	 * version first and level-triggered interrupts never get EOIed in
558 	 * IOAPIC.
559 	 */
560 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
561 	    !ioapic_in_kernel(vcpu->kvm))
562 		v |= APIC_LVR_DIRECTED_EOI;
563 	kvm_lapic_set_reg(apic, APIC_LVR, v);
564 }
565 
566 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
567 {
568 	int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
569 	struct kvm_lapic *apic = vcpu->arch.apic;
570 	int i;
571 
572 	if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
573 		return;
574 
575 	/* Initialize/mask any "new" LVT entries. */
576 	for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
577 		kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
578 
579 	apic->nr_lvt_entries = nr_lvt_entries;
580 
581 	/* The number of LVT entries is reflected in the version register. */
582 	kvm_apic_set_version(vcpu);
583 }
584 
585 static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
586 	[LVT_TIMER] = LVT_MASK,      /* timer mode mask added at runtime */
587 	[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
588 	[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
589 	[LVT_LINT0] = LINT_MASK,
590 	[LVT_LINT1] = LINT_MASK,
591 	[LVT_ERROR] = LVT_MASK,
592 	[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
593 };
594 
595 static u8 count_vectors(void *bitmap)
596 {
597 	int vec;
598 	u32 *reg;
599 	u8 count = 0;
600 
601 	for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
602 		reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
603 		count += hweight32(*reg);
604 	}
605 
606 	return count;
607 }
608 
609 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
610 {
611 	unsigned long pir_vals[NR_PIR_WORDS];
612 	u32 *__pir = (void *)pir_vals;
613 	u32 i, vec;
614 	u32 irr_val, prev_irr_val;
615 	int max_updated_irr;
616 
617 	max_updated_irr = -1;
618 	*max_irr = -1;
619 
620 	if (!pi_harvest_pir(pir, pir_vals))
621 		return false;
622 
623 	for (i = vec = 0; i <= 7; i++, vec += 32) {
624 		u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
625 
626 		irr_val = READ_ONCE(*p_irr);
627 
628 		if (__pir[i]) {
629 			prev_irr_val = irr_val;
630 			do {
631 				irr_val = prev_irr_val | __pir[i];
632 			} while (prev_irr_val != irr_val &&
633 				 !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
634 
635 			if (prev_irr_val != irr_val)
636 				max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
637 		}
638 		if (irr_val)
639 			*max_irr = __fls(irr_val) + vec;
640 	}
641 
642 	return ((max_updated_irr != -1) &&
643 		(max_updated_irr == *max_irr));
644 }
645 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
646 
647 bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
648 {
649 	struct kvm_lapic *apic = vcpu->arch.apic;
650 	bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
651 
652 	if (unlikely(!apic->apicv_active && irr_updated))
653 		apic->irr_pending = true;
654 	return irr_updated;
655 }
656 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
657 
658 static inline int apic_search_irr(struct kvm_lapic *apic)
659 {
660 	return apic_find_highest_vector(apic->regs + APIC_IRR);
661 }
662 
663 static inline int apic_find_highest_irr(struct kvm_lapic *apic)
664 {
665 	int result;
666 
667 	/*
668 	 * Note that irr_pending is just a hint. It will be always
669 	 * true with virtual interrupt delivery enabled.
670 	 */
671 	if (!apic->irr_pending)
672 		return -1;
673 
674 	result = apic_search_irr(apic);
675 	ASSERT(result == -1 || result >= 16);
676 
677 	return result;
678 }
679 
680 static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
681 {
682 	if (unlikely(apic->apicv_active)) {
683 		apic_clear_vector(vec, apic->regs + APIC_IRR);
684 	} else {
685 		apic->irr_pending = false;
686 		apic_clear_vector(vec, apic->regs + APIC_IRR);
687 		if (apic_search_irr(apic) != -1)
688 			apic->irr_pending = true;
689 	}
690 }
691 
692 void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
693 {
694 	apic_clear_irr(vec, vcpu->arch.apic);
695 }
696 EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
697 
698 static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
699 {
700 	return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
701 }
702 
703 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
704 {
705 	if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
706 			       apic_vector_to_isr(vec, apic)))
707 		return;
708 
709 	/*
710 	 * With APIC virtualization enabled, all caching is disabled
711 	 * because the processor can modify ISR under the hood.  Instead
712 	 * just set SVI.
713 	 */
714 	if (unlikely(apic->apicv_active))
715 		kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec);
716 	else {
717 		++apic->isr_count;
718 		BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
719 		/*
720 		 * ISR (in service register) bit is set when injecting an interrupt.
721 		 * The highest vector is injected. Thus the latest bit set matches
722 		 * the highest bit in ISR.
723 		 */
724 		apic->highest_isr_cache = vec;
725 	}
726 }
727 
728 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
729 {
730 	int result;
731 
732 	/*
733 	 * Note that isr_count is always 1, and highest_isr_cache
734 	 * is always -1, with APIC virtualization enabled.
735 	 */
736 	if (!apic->isr_count)
737 		return -1;
738 	if (likely(apic->highest_isr_cache != -1))
739 		return apic->highest_isr_cache;
740 
741 	result = apic_find_highest_vector(apic->regs + APIC_ISR);
742 	ASSERT(result == -1 || result >= 16);
743 
744 	return result;
745 }
746 
747 static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
748 {
749 	if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
750 				  apic_vector_to_isr(vec, apic)))
751 		return;
752 
753 	/*
754 	 * We do get here for APIC virtualization enabled if the guest
755 	 * uses the Hyper-V APIC enlightenment.  In this case we may need
756 	 * to trigger a new interrupt delivery by writing the SVI field;
757 	 * on the other hand isr_count and highest_isr_cache are unused
758 	 * and must be left alone.
759 	 */
760 	if (unlikely(apic->apicv_active))
761 		kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
762 	else {
763 		--apic->isr_count;
764 		BUG_ON(apic->isr_count < 0);
765 		apic->highest_isr_cache = -1;
766 	}
767 }
768 
769 void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
770 {
771 	struct kvm_lapic *apic = vcpu->arch.apic;
772 
773 	if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
774 		return;
775 
776 	kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
777 }
778 EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
779 
780 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
781 {
782 	/* This may race with setting of irr in __apic_accept_irq() and
783 	 * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
784 	 * will cause vmexit immediately and the value will be recalculated
785 	 * on the next vmentry.
786 	 */
787 	return apic_find_highest_irr(vcpu->arch.apic);
788 }
789 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
790 
791 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
792 			     int vector, int level, int trig_mode,
793 			     struct dest_map *dest_map);
794 
795 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
796 		     struct dest_map *dest_map)
797 {
798 	struct kvm_lapic *apic = vcpu->arch.apic;
799 
800 	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
801 			irq->level, irq->trig_mode, dest_map);
802 }
803 
804 static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
805 			 struct kvm_lapic_irq *irq, u32 min)
806 {
807 	int i, count = 0;
808 	struct kvm_vcpu *vcpu;
809 
810 	if (min > map->max_apic_id)
811 		return 0;
812 
813 	for_each_set_bit(i, ipi_bitmap,
814 		min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
815 		if (map->phys_map[min + i]) {
816 			vcpu = map->phys_map[min + i]->vcpu;
817 			count += kvm_apic_set_irq(vcpu, irq, NULL);
818 		}
819 	}
820 
821 	return count;
822 }
823 
824 int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
825 		    unsigned long ipi_bitmap_high, u32 min,
826 		    unsigned long icr, int op_64_bit)
827 {
828 	struct kvm_apic_map *map;
829 	struct kvm_lapic_irq irq = {0};
830 	int cluster_size = op_64_bit ? 64 : 32;
831 	int count;
832 
833 	if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
834 		return -KVM_EINVAL;
835 
836 	irq.vector = icr & APIC_VECTOR_MASK;
837 	irq.delivery_mode = icr & APIC_MODE_MASK;
838 	irq.level = (icr & APIC_INT_ASSERT) != 0;
839 	irq.trig_mode = icr & APIC_INT_LEVELTRIG;
840 
841 	rcu_read_lock();
842 	map = rcu_dereference(kvm->arch.apic_map);
843 
844 	count = -EOPNOTSUPP;
845 	if (likely(map)) {
846 		count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
847 		min += cluster_size;
848 		count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
849 	}
850 
851 	rcu_read_unlock();
852 	return count;
853 }
854 
855 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
856 {
857 
858 	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
859 				      sizeof(val));
860 }
861 
862 static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
863 {
864 
865 	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
866 				      sizeof(*val));
867 }
868 
869 static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
870 {
871 	return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
872 }
873 
874 static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
875 {
876 	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
877 		return;
878 
879 	__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
880 }
881 
882 static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
883 {
884 	u8 val;
885 
886 	if (pv_eoi_get_user(vcpu, &val) < 0)
887 		return false;
888 
889 	val &= KVM_PV_EOI_ENABLED;
890 
891 	if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
892 		return false;
893 
894 	/*
895 	 * Clear pending bit in any case: it will be set again on vmentry.
896 	 * While this might not be ideal from performance point of view,
897 	 * this makes sure pv eoi is only enabled when we know it's safe.
898 	 */
899 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
900 
901 	return val;
902 }
903 
904 static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
905 {
906 	int highest_irr;
907 	if (kvm_x86_ops.sync_pir_to_irr)
908 		highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
909 	else
910 		highest_irr = apic_find_highest_irr(apic);
911 	if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
912 		return -1;
913 	return highest_irr;
914 }
915 
916 static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
917 {
918 	u32 tpr, isrv, ppr, old_ppr;
919 	int isr;
920 
921 	old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
922 	tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
923 	isr = apic_find_highest_isr(apic);
924 	isrv = (isr != -1) ? isr : 0;
925 
926 	if ((tpr & 0xf0) >= (isrv & 0xf0))
927 		ppr = tpr & 0xff;
928 	else
929 		ppr = isrv & 0xf0;
930 
931 	*new_ppr = ppr;
932 	if (old_ppr != ppr)
933 		kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
934 
935 	return ppr < old_ppr;
936 }
937 
938 static void apic_update_ppr(struct kvm_lapic *apic)
939 {
940 	u32 ppr;
941 
942 	if (__apic_update_ppr(apic, &ppr) &&
943 	    apic_has_interrupt_for_ppr(apic, ppr) != -1)
944 		kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
945 }
946 
947 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
948 {
949 	apic_update_ppr(vcpu->arch.apic);
950 }
951 EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
952 
953 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
954 {
955 	kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
956 	apic_update_ppr(apic);
957 }
958 
959 static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
960 {
961 	return mda == (apic_x2apic_mode(apic) ?
962 			X2APIC_BROADCAST : APIC_BROADCAST);
963 }
964 
965 static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
966 {
967 	if (kvm_apic_broadcast(apic, mda))
968 		return true;
969 
970 	/*
971 	 * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
972 	 * were in x2APIC mode if the target APIC ID can't be encoded as an
973 	 * xAPIC ID.  This allows unique addressing of hotplugged vCPUs (which
974 	 * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
975 	 * mode.  Match the x2APIC ID if and only if the target APIC ID can't
976 	 * be encoded in xAPIC to avoid spurious matches against a vCPU that
977 	 * changed its (addressable) xAPIC ID (which is writable).
978 	 */
979 	if (apic_x2apic_mode(apic) || mda > 0xff)
980 		return mda == kvm_x2apic_id(apic);
981 
982 	return mda == kvm_xapic_id(apic);
983 }
984 
985 static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
986 {
987 	u32 logical_id;
988 
989 	if (kvm_apic_broadcast(apic, mda))
990 		return true;
991 
992 	logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
993 
994 	if (apic_x2apic_mode(apic))
995 		return ((logical_id >> 16) == (mda >> 16))
996 		       && (logical_id & mda & 0xffff) != 0;
997 
998 	logical_id = GET_APIC_LOGICAL_ID(logical_id);
999 
1000 	switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
1001 	case APIC_DFR_FLAT:
1002 		return (logical_id & mda) != 0;
1003 	case APIC_DFR_CLUSTER:
1004 		return ((logical_id >> 4) == (mda >> 4))
1005 		       && (logical_id & mda & 0xf) != 0;
1006 	default:
1007 		return false;
1008 	}
1009 }
1010 
1011 /* The KVM local APIC implementation has two quirks:
1012  *
1013  *  - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
1014  *    in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
1015  *    KVM doesn't do that aliasing.
1016  *
1017  *  - in-kernel IOAPIC messages have to be delivered directly to
1018  *    x2APIC, because the kernel does not support interrupt remapping.
1019  *    In order to support broadcast without interrupt remapping, x2APIC
1020  *    rewrites the destination of non-IPI messages from APIC_BROADCAST
1021  *    to X2APIC_BROADCAST.
1022  *
1023  * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
1024  * important when userspace wants to use x2APIC-format MSIs, because
1025  * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
1026  */
1027 static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
1028 		struct kvm_lapic *source, struct kvm_lapic *target)
1029 {
1030 	bool ipi = source != NULL;
1031 
1032 	if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
1033 	    !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
1034 		return X2APIC_BROADCAST;
1035 
1036 	return dest_id;
1037 }
1038 
1039 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
1040 			   int shorthand, unsigned int dest, int dest_mode)
1041 {
1042 	struct kvm_lapic *target = vcpu->arch.apic;
1043 	u32 mda = kvm_apic_mda(vcpu, dest, source, target);
1044 
1045 	ASSERT(target);
1046 	switch (shorthand) {
1047 	case APIC_DEST_NOSHORT:
1048 		if (dest_mode == APIC_DEST_PHYSICAL)
1049 			return kvm_apic_match_physical_addr(target, mda);
1050 		else
1051 			return kvm_apic_match_logical_addr(target, mda);
1052 	case APIC_DEST_SELF:
1053 		return target == source;
1054 	case APIC_DEST_ALLINC:
1055 		return true;
1056 	case APIC_DEST_ALLBUT:
1057 		return target != source;
1058 	default:
1059 		return false;
1060 	}
1061 }
1062 EXPORT_SYMBOL_GPL(kvm_apic_match_dest);
1063 
1064 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
1065 		       const unsigned long *bitmap, u32 bitmap_size)
1066 {
1067 	u32 mod;
1068 	int i, idx = -1;
1069 
1070 	mod = vector % dest_vcpus;
1071 
1072 	for (i = 0; i <= mod; i++) {
1073 		idx = find_next_bit(bitmap, bitmap_size, idx + 1);
1074 		BUG_ON(idx == bitmap_size);
1075 	}
1076 
1077 	return idx;
1078 }
1079 
1080 static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
1081 {
1082 	if (!kvm->arch.disabled_lapic_found) {
1083 		kvm->arch.disabled_lapic_found = true;
1084 		pr_info("Disabled LAPIC found during irq injection\n");
1085 	}
1086 }
1087 
1088 static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
1089 		struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
1090 {
1091 	if (kvm->arch.x2apic_broadcast_quirk_disabled) {
1092 		if ((irq->dest_id == APIC_BROADCAST &&
1093 		     map->logical_mode != KVM_APIC_MODE_X2APIC))
1094 			return true;
1095 		if (irq->dest_id == X2APIC_BROADCAST)
1096 			return true;
1097 	} else {
1098 		bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
1099 		if (irq->dest_id == (x2apic_ipi ?
1100 		                     X2APIC_BROADCAST : APIC_BROADCAST))
1101 			return true;
1102 	}
1103 
1104 	return false;
1105 }
1106 
1107 /* Return true if the interrupt can be handled by using *bitmap as index mask
1108  * for valid destinations in *dst array.
1109  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
1110  * Note: we may have zero kvm_lapic destinations when we return true, which
1111  * means that the interrupt should be dropped.  In this case, *bitmap would be
1112  * zero and *dst undefined.
1113  */
1114 static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
1115 		struct kvm_lapic **src, struct kvm_lapic_irq *irq,
1116 		struct kvm_apic_map *map, struct kvm_lapic ***dst,
1117 		unsigned long *bitmap)
1118 {
1119 	int i, lowest;
1120 
1121 	if (irq->shorthand == APIC_DEST_SELF && src) {
1122 		*dst = src;
1123 		*bitmap = 1;
1124 		return true;
1125 	} else if (irq->shorthand)
1126 		return false;
1127 
1128 	if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
1129 		return false;
1130 
1131 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
1132 		if (irq->dest_id > map->max_apic_id) {
1133 			*bitmap = 0;
1134 		} else {
1135 			u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
1136 			*dst = &map->phys_map[dest_id];
1137 			*bitmap = 1;
1138 		}
1139 		return true;
1140 	}
1141 
1142 	*bitmap = 0;
1143 	if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
1144 				(u16 *)bitmap))
1145 		return false;
1146 
1147 	if (!kvm_lowest_prio_delivery(irq))
1148 		return true;
1149 
1150 	if (!kvm_vector_hashing_enabled()) {
1151 		lowest = -1;
1152 		for_each_set_bit(i, bitmap, 16) {
1153 			if (!(*dst)[i])
1154 				continue;
1155 			if (lowest < 0)
1156 				lowest = i;
1157 			else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
1158 						(*dst)[lowest]->vcpu) < 0)
1159 				lowest = i;
1160 		}
1161 	} else {
1162 		if (!*bitmap)
1163 			return true;
1164 
1165 		lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
1166 				bitmap, 16);
1167 
1168 		if (!(*dst)[lowest]) {
1169 			kvm_apic_disabled_lapic_found(kvm);
1170 			*bitmap = 0;
1171 			return true;
1172 		}
1173 	}
1174 
1175 	*bitmap = (lowest >= 0) ? 1 << lowest : 0;
1176 
1177 	return true;
1178 }
1179 
1180 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
1181 		struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
1182 {
1183 	struct kvm_apic_map *map;
1184 	unsigned long bitmap;
1185 	struct kvm_lapic **dst = NULL;
1186 	int i;
1187 	bool ret;
1188 
1189 	*r = -1;
1190 
1191 	if (irq->shorthand == APIC_DEST_SELF) {
1192 		if (KVM_BUG_ON(!src, kvm)) {
1193 			*r = 0;
1194 			return true;
1195 		}
1196 		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
1197 		return true;
1198 	}
1199 
1200 	rcu_read_lock();
1201 	map = rcu_dereference(kvm->arch.apic_map);
1202 
1203 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
1204 	if (ret) {
1205 		*r = 0;
1206 		for_each_set_bit(i, &bitmap, 16) {
1207 			if (!dst[i])
1208 				continue;
1209 			*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
1210 		}
1211 	}
1212 
1213 	rcu_read_unlock();
1214 	return ret;
1215 }
1216 
1217 /*
1218  * This routine tries to handle interrupts in posted mode, here is how
1219  * it deals with different cases:
1220  * - For single-destination interrupts, handle it in posted mode
1221  * - Else if vector hashing is enabled and it is a lowest-priority
1222  *   interrupt, handle it in posted mode and use the following mechanism
1223  *   to find the destination vCPU.
1224  *	1. For lowest-priority interrupts, store all the possible
1225  *	   destination vCPUs in an array.
1226  *	2. Use "guest vector % max number of destination vCPUs" to find
1227  *	   the right destination vCPU in the array for the lowest-priority
1228  *	   interrupt.
1229  * - Otherwise, use remapped mode to inject the interrupt.
1230  */
1231 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
1232 			struct kvm_vcpu **dest_vcpu)
1233 {
1234 	struct kvm_apic_map *map;
1235 	unsigned long bitmap;
1236 	struct kvm_lapic **dst = NULL;
1237 	bool ret = false;
1238 
1239 	if (irq->shorthand)
1240 		return false;
1241 
1242 	rcu_read_lock();
1243 	map = rcu_dereference(kvm->arch.apic_map);
1244 
1245 	if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
1246 			hweight16(bitmap) == 1) {
1247 		unsigned long i = find_first_bit(&bitmap, 16);
1248 
1249 		if (dst[i]) {
1250 			*dest_vcpu = dst[i]->vcpu;
1251 			ret = true;
1252 		}
1253 	}
1254 
1255 	rcu_read_unlock();
1256 	return ret;
1257 }
1258 
1259 /*
1260  * Add a pending IRQ into lapic.
1261  * Return 1 if successfully added and 0 if discarded.
1262  */
1263 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
1264 			     int vector, int level, int trig_mode,
1265 			     struct dest_map *dest_map)
1266 {
1267 	int result = 0;
1268 	struct kvm_vcpu *vcpu = apic->vcpu;
1269 
1270 	trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
1271 				  trig_mode, vector);
1272 	switch (delivery_mode) {
1273 	case APIC_DM_LOWEST:
1274 		vcpu->arch.apic_arb_prio++;
1275 		fallthrough;
1276 	case APIC_DM_FIXED:
1277 		if (unlikely(trig_mode && !level))
1278 			break;
1279 
1280 		/* FIXME add logic for vcpu on reset */
1281 		if (unlikely(!apic_enabled(apic)))
1282 			break;
1283 
1284 		result = 1;
1285 
1286 		if (dest_map) {
1287 			__set_bit(vcpu->vcpu_id, dest_map->map);
1288 			dest_map->vectors[vcpu->vcpu_id] = vector;
1289 		}
1290 
1291 		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
1292 			if (trig_mode)
1293 				apic_set_vector(vector, apic->regs + APIC_TMR);
1294 			else
1295 				apic_clear_vector(vector, apic->regs + APIC_TMR);
1296 		}
1297 
1298 		kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
1299 						trig_mode, vector);
1300 		break;
1301 
1302 	case APIC_DM_REMRD:
1303 		result = 1;
1304 		vcpu->arch.pv.pv_unhalted = 1;
1305 		kvm_make_request(KVM_REQ_EVENT, vcpu);
1306 		kvm_vcpu_kick(vcpu);
1307 		break;
1308 
1309 	case APIC_DM_SMI:
1310 		if (!kvm_inject_smi(vcpu)) {
1311 			kvm_vcpu_kick(vcpu);
1312 			result = 1;
1313 		}
1314 		break;
1315 
1316 	case APIC_DM_NMI:
1317 		result = 1;
1318 		kvm_inject_nmi(vcpu);
1319 		kvm_vcpu_kick(vcpu);
1320 		break;
1321 
1322 	case APIC_DM_INIT:
1323 		if (!trig_mode || level) {
1324 			result = 1;
1325 			/* assumes that there are only KVM_APIC_INIT/SIPI */
1326 			apic->pending_events = (1UL << KVM_APIC_INIT);
1327 			kvm_make_request(KVM_REQ_EVENT, vcpu);
1328 			kvm_vcpu_kick(vcpu);
1329 		}
1330 		break;
1331 
1332 	case APIC_DM_STARTUP:
1333 		result = 1;
1334 		apic->sipi_vector = vector;
1335 		/* make sure sipi_vector is visible for the receiver */
1336 		smp_wmb();
1337 		set_bit(KVM_APIC_SIPI, &apic->pending_events);
1338 		kvm_make_request(KVM_REQ_EVENT, vcpu);
1339 		kvm_vcpu_kick(vcpu);
1340 		break;
1341 
1342 	case APIC_DM_EXTINT:
1343 		/*
1344 		 * Should only be called by kvm_apic_local_deliver() with LVT0,
1345 		 * before NMI watchdog was enabled. Already handled by
1346 		 * kvm_apic_accept_pic_intr().
1347 		 */
1348 		break;
1349 
1350 	default:
1351 		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
1352 		       delivery_mode);
1353 		break;
1354 	}
1355 	return result;
1356 }
1357 
1358 /*
1359  * This routine identifies the destination vcpus mask meant to receive the
1360  * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find
1361  * out the destination vcpus array and set the bitmap or it traverses to
1362  * each available vcpu to identify the same.
1363  */
1364 void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
1365 			      unsigned long *vcpu_bitmap)
1366 {
1367 	struct kvm_lapic **dest_vcpu = NULL;
1368 	struct kvm_lapic *src = NULL;
1369 	struct kvm_apic_map *map;
1370 	struct kvm_vcpu *vcpu;
1371 	unsigned long bitmap, i;
1372 	int vcpu_idx;
1373 	bool ret;
1374 
1375 	rcu_read_lock();
1376 	map = rcu_dereference(kvm->arch.apic_map);
1377 
1378 	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
1379 					  &bitmap);
1380 	if (ret) {
1381 		for_each_set_bit(i, &bitmap, 16) {
1382 			if (!dest_vcpu[i])
1383 				continue;
1384 			vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
1385 			__set_bit(vcpu_idx, vcpu_bitmap);
1386 		}
1387 	} else {
1388 		kvm_for_each_vcpu(i, vcpu, kvm) {
1389 			if (!kvm_apic_present(vcpu))
1390 				continue;
1391 			if (!kvm_apic_match_dest(vcpu, NULL,
1392 						 irq->shorthand,
1393 						 irq->dest_id,
1394 						 irq->dest_mode))
1395 				continue;
1396 			__set_bit(i, vcpu_bitmap);
1397 		}
1398 	}
1399 	rcu_read_unlock();
1400 }
1401 
1402 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
1403 {
1404 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
1405 }
1406 
1407 static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
1408 {
1409 	return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
1410 }
1411 
1412 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
1413 {
1414 	int __maybe_unused trigger_mode;
1415 
1416 	/* Eoi the ioapic only if the ioapic doesn't own the vector. */
1417 	if (!kvm_ioapic_handles_vector(apic, vector))
1418 		return;
1419 
1420 	/*
1421 	 * If the intercepted EOI is for an IRQ that was pending from previous
1422 	 * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
1423 	 * no longer need to be intercepted.
1424 	 */
1425 	if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
1426 		kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
1427 
1428 	/* Request a KVM exit to inform the userspace IOAPIC. */
1429 	if (irqchip_split(apic->vcpu->kvm)) {
1430 		apic->vcpu->arch.pending_ioapic_eoi = vector;
1431 		kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
1432 		return;
1433 	}
1434 
1435 #ifdef CONFIG_KVM_IOAPIC
1436 	if (apic_test_vector(vector, apic->regs + APIC_TMR))
1437 		trigger_mode = IOAPIC_LEVEL_TRIG;
1438 	else
1439 		trigger_mode = IOAPIC_EDGE_TRIG;
1440 
1441 	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
1442 #endif
1443 }
1444 
1445 static int apic_set_eoi(struct kvm_lapic *apic)
1446 {
1447 	int vector = apic_find_highest_isr(apic);
1448 
1449 	trace_kvm_eoi(apic, vector);
1450 
1451 	/*
1452 	 * Not every write EOI will has corresponding ISR,
1453 	 * one example is when Kernel check timer on setup_IO_APIC
1454 	 */
1455 	if (vector == -1)
1456 		return vector;
1457 
1458 	apic_clear_isr(vector, apic);
1459 	apic_update_ppr(apic);
1460 
1461 	if (kvm_hv_synic_has_vector(apic->vcpu, vector))
1462 		kvm_hv_synic_send_eoi(apic->vcpu, vector);
1463 
1464 	kvm_ioapic_send_eoi(apic, vector);
1465 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
1466 	return vector;
1467 }
1468 
1469 /*
1470  * this interface assumes a trap-like exit, which has already finished
1471  * desired side effect including vISR and vPPR update.
1472  */
1473 void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
1474 {
1475 	struct kvm_lapic *apic = vcpu->arch.apic;
1476 
1477 	trace_kvm_eoi(apic, vector);
1478 
1479 	kvm_ioapic_send_eoi(apic, vector);
1480 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
1481 }
1482 EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
1483 
1484 void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
1485 {
1486 	struct kvm_lapic_irq irq;
1487 
1488 	/* KVM has no delay and should always clear the BUSY/PENDING flag. */
1489 	WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
1490 
1491 	irq.vector = icr_low & APIC_VECTOR_MASK;
1492 	irq.delivery_mode = icr_low & APIC_MODE_MASK;
1493 	irq.dest_mode = icr_low & APIC_DEST_MASK;
1494 	irq.level = (icr_low & APIC_INT_ASSERT) != 0;
1495 	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
1496 	irq.shorthand = icr_low & APIC_SHORT_MASK;
1497 	irq.msi_redir_hint = false;
1498 	if (apic_x2apic_mode(apic))
1499 		irq.dest_id = icr_high;
1500 	else
1501 		irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
1502 
1503 	trace_kvm_apic_ipi(icr_low, irq.dest_id);
1504 
1505 	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
1506 }
1507 EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
1508 
1509 static u32 apic_get_tmcct(struct kvm_lapic *apic)
1510 {
1511 	ktime_t remaining, now;
1512 	s64 ns;
1513 
1514 	ASSERT(apic != NULL);
1515 
1516 	/* if initial count is 0, current count should also be 0 */
1517 	if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
1518 		apic->lapic_timer.period == 0)
1519 		return 0;
1520 
1521 	now = ktime_get();
1522 	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
1523 	if (ktime_to_ns(remaining) < 0)
1524 		remaining = 0;
1525 
1526 	ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
1527 	return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
1528 			      apic->divide_count));
1529 }
1530 
1531 static void __report_tpr_access(struct kvm_lapic *apic, bool write)
1532 {
1533 	struct kvm_vcpu *vcpu = apic->vcpu;
1534 	struct kvm_run *run = vcpu->run;
1535 
1536 	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
1537 	run->tpr_access.rip = kvm_rip_read(vcpu);
1538 	run->tpr_access.is_write = write;
1539 }
1540 
1541 static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
1542 {
1543 	if (apic->vcpu->arch.tpr_access_reporting)
1544 		__report_tpr_access(apic, write);
1545 }
1546 
1547 static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
1548 {
1549 	u32 val = 0;
1550 
1551 	if (offset >= LAPIC_MMIO_LENGTH)
1552 		return 0;
1553 
1554 	switch (offset) {
1555 	case APIC_ARBPRI:
1556 		break;
1557 
1558 	case APIC_TMCCT:	/* Timer CCR */
1559 		if (apic_lvtt_tscdeadline(apic))
1560 			return 0;
1561 
1562 		val = apic_get_tmcct(apic);
1563 		break;
1564 	case APIC_PROCPRI:
1565 		apic_update_ppr(apic);
1566 		val = kvm_lapic_get_reg(apic, offset);
1567 		break;
1568 	case APIC_TASKPRI:
1569 		report_tpr_access(apic, false);
1570 		fallthrough;
1571 	default:
1572 		val = kvm_lapic_get_reg(apic, offset);
1573 		break;
1574 	}
1575 
1576 	return val;
1577 }
1578 
1579 static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
1580 {
1581 	return container_of(dev, struct kvm_lapic, dev);
1582 }
1583 
/*
 * Register bitmask helpers.  A register at byte offset @reg is represented
 * by bit (@reg >> 4) of a 64-bit mask.
 *
 * APIC_REG_MASK(reg):          the single bit for @reg.
 * APIC_REGS_MASK(first, count): @count consecutive bits starting at the bit
 *                               for @first.
 */
#define APIC_REG_MASK(reg)	(1ull << ((reg) >> 4))
#define APIC_REGS_MASK(first, count) \
	(((1ull << (count)) - 1) << ((first) >> 4))
1587 
1588 u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
1589 {
1590 	/* Leave bits '0' for reserved and write-only registers. */
1591 	u64 valid_reg_mask =
1592 		APIC_REG_MASK(APIC_ID) |
1593 		APIC_REG_MASK(APIC_LVR) |
1594 		APIC_REG_MASK(APIC_TASKPRI) |
1595 		APIC_REG_MASK(APIC_PROCPRI) |
1596 		APIC_REG_MASK(APIC_LDR) |
1597 		APIC_REG_MASK(APIC_SPIV) |
1598 		APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
1599 		APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
1600 		APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
1601 		APIC_REG_MASK(APIC_ESR) |
1602 		APIC_REG_MASK(APIC_ICR) |
1603 		APIC_REG_MASK(APIC_LVTT) |
1604 		APIC_REG_MASK(APIC_LVTTHMR) |
1605 		APIC_REG_MASK(APIC_LVTPC) |
1606 		APIC_REG_MASK(APIC_LVT0) |
1607 		APIC_REG_MASK(APIC_LVT1) |
1608 		APIC_REG_MASK(APIC_LVTERR) |
1609 		APIC_REG_MASK(APIC_TMICT) |
1610 		APIC_REG_MASK(APIC_TMCCT) |
1611 		APIC_REG_MASK(APIC_TDCR);
1612 
1613 	if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
1614 		valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
1615 
1616 	/* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
1617 	if (!apic_x2apic_mode(apic))
1618 		valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
1619 				  APIC_REG_MASK(APIC_DFR) |
1620 				  APIC_REG_MASK(APIC_ICR2);
1621 
1622 	return valid_reg_mask;
1623 }
1624 EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
1625 
1626 static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
1627 			      void *data)
1628 {
1629 	unsigned char alignment = offset & 0xf;
1630 	u32 result;
1631 
1632 	/*
1633 	 * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
1634 	 * x2APIC and needs to be manually handled by the caller.
1635 	 */
1636 	WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
1637 
1638 	if (alignment + len > 4)
1639 		return 1;
1640 
1641 	if (offset > 0x3f0 ||
1642 	    !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
1643 		return 1;
1644 
1645 	result = __apic_read(apic, offset & ~0xf);
1646 
1647 	trace_kvm_apic_read(offset, result);
1648 
1649 	switch (len) {
1650 	case 1:
1651 	case 2:
1652 	case 4:
1653 		memcpy(data, (char *)&result + alignment, len);
1654 		break;
1655 	default:
1656 		printk(KERN_ERR "Local APIC read with len = %x, "
1657 		       "should be 1,2, or 4 instead\n", len);
1658 		break;
1659 	}
1660 	return 0;
1661 }
1662 
1663 static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
1664 {
1665 	return addr >= apic->base_address &&
1666 		addr < apic->base_address + LAPIC_MMIO_LENGTH;
1667 }
1668 
1669 static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1670 			   gpa_t address, int len, void *data)
1671 {
1672 	struct kvm_lapic *apic = to_lapic(this);
1673 	u32 offset = address - apic->base_address;
1674 
1675 	if (!apic_mmio_in_range(apic, address))
1676 		return -EOPNOTSUPP;
1677 
1678 	if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
1679 		if (!kvm_check_has_quirk(vcpu->kvm,
1680 					 KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
1681 			return -EOPNOTSUPP;
1682 
1683 		memset(data, 0xff, len);
1684 		return 0;
1685 	}
1686 
1687 	kvm_lapic_reg_read(apic, offset, len, data);
1688 
1689 	return 0;
1690 }
1691 
1692 static void update_divide_count(struct kvm_lapic *apic)
1693 {
1694 	u32 tmp1, tmp2, tdcr;
1695 
1696 	tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
1697 	tmp1 = tdcr & 0xf;
1698 	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
1699 	apic->divide_count = 0x1 << (tmp2 & 0x7);
1700 }
1701 
1702 static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
1703 {
1704 	/*
1705 	 * Do not allow the guest to program periodic timers with small
1706 	 * interval, since the hrtimers are not throttled by the host
1707 	 * scheduler.
1708 	 */
1709 	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
1710 		s64 min_period = min_timer_period_us * 1000LL;
1711 
1712 		if (apic->lapic_timer.period < min_period) {
1713 			pr_info_once(
1714 			    "vcpu %i: requested %lld ns "
1715 			    "lapic timer period limited to %lld ns\n",
1716 			    apic->vcpu->vcpu_id,
1717 			    apic->lapic_timer.period, min_period);
1718 			apic->lapic_timer.period = min_period;
1719 		}
1720 	}
1721 }
1722 
1723 static void cancel_hv_timer(struct kvm_lapic *apic);
1724 
1725 static void cancel_apic_timer(struct kvm_lapic *apic)
1726 {
1727 	hrtimer_cancel(&apic->lapic_timer.timer);
1728 	preempt_disable();
1729 	if (apic->lapic_timer.hv_timer_in_use)
1730 		cancel_hv_timer(apic);
1731 	preempt_enable();
1732 	atomic_set(&apic->lapic_timer.pending, 0);
1733 }
1734 
1735 static void apic_update_lvtt(struct kvm_lapic *apic)
1736 {
1737 	u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
1738 			apic->lapic_timer.timer_mode_mask;
1739 
1740 	if (apic->lapic_timer.timer_mode != timer_mode) {
1741 		if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
1742 				APIC_LVT_TIMER_TSCDEADLINE)) {
1743 			cancel_apic_timer(apic);
1744 			kvm_lapic_set_reg(apic, APIC_TMICT, 0);
1745 			apic->lapic_timer.period = 0;
1746 			apic->lapic_timer.tscdeadline = 0;
1747 		}
1748 		apic->lapic_timer.timer_mode = timer_mode;
1749 		limit_periodic_timer_frequency(apic);
1750 	}
1751 }
1752 
1753 /*
1754  * On APICv, this test will cause a busy wait
1755  * during a higher-priority task.
1756  */
1757 
1758 static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
1759 {
1760 	struct kvm_lapic *apic = vcpu->arch.apic;
1761 	u32 reg;
1762 
1763 	/*
1764 	 * Assume a timer IRQ was "injected" if the APIC is protected.  KVM's
1765 	 * copy of the vIRR is bogus, it's the responsibility of the caller to
1766 	 * precisely check whether or not a timer IRQ is pending.
1767 	 */
1768 	if (apic->guest_apic_protected)
1769 		return true;
1770 
1771 	reg = kvm_lapic_get_reg(apic, APIC_LVTT);
1772 	if (kvm_apic_hw_enabled(apic)) {
1773 		int vec = reg & APIC_VECTOR_MASK;
1774 		void *bitmap = apic->regs + APIC_ISR;
1775 
1776 		if (apic->apicv_active)
1777 			bitmap = apic->regs + APIC_IRR;
1778 
1779 		if (apic_test_vector(vec, bitmap))
1780 			return true;
1781 	}
1782 	return false;
1783 }
1784 
1785 static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
1786 {
1787 	u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
1788 
1789 	/*
1790 	 * If the guest TSC is running at a different ratio than the host, then
1791 	 * convert the delay to nanoseconds to achieve an accurate delay.  Note
1792 	 * that __delay() uses delay_tsc whenever the hardware has TSC, thus
1793 	 * always for VMX enabled hardware.
1794 	 */
1795 	if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
1796 		__delay(min(guest_cycles,
1797 			nsec_to_cycles(vcpu, timer_advance_ns)));
1798 	} else {
1799 		u64 delay_ns = guest_cycles * 1000000ULL;
1800 		do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
1801 		ndelay(min_t(u32, delay_ns, timer_advance_ns));
1802 	}
1803 }
1804 
1805 static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
1806 					      s64 advance_expire_delta)
1807 {
1808 	struct kvm_lapic *apic = vcpu->arch.apic;
1809 	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
1810 	u64 ns;
1811 
1812 	/* Do not adjust for tiny fluctuations or large random spikes. */
1813 	if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
1814 	    abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
1815 		return;
1816 
1817 	/* too early */
1818 	if (advance_expire_delta < 0) {
1819 		ns = -advance_expire_delta * 1000000ULL;
1820 		do_div(ns, vcpu->arch.virtual_tsc_khz);
1821 		timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
1822 	} else {
1823 	/* too late */
1824 		ns = advance_expire_delta * 1000000ULL;
1825 		do_div(ns, vcpu->arch.virtual_tsc_khz);
1826 		timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
1827 	}
1828 
1829 	if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
1830 		timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
1831 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
1832 }
1833 
1834 static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
1835 {
1836 	struct kvm_lapic *apic = vcpu->arch.apic;
1837 	u64 guest_tsc, tsc_deadline;
1838 
1839 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
1840 	apic->lapic_timer.expired_tscdeadline = 0;
1841 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1842 	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
1843 
1844 	adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
1845 
1846 	/*
1847 	 * If the timer fired early, reread the TSC to account for the overhead
1848 	 * of the above adjustment to avoid waiting longer than is necessary.
1849 	 */
1850 	if (guest_tsc < tsc_deadline)
1851 		guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1852 
1853 	if (guest_tsc < tsc_deadline)
1854 		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
1855 }
1856 
1857 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
1858 {
1859 	if (lapic_in_kernel(vcpu) &&
1860 	    vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
1861 	    vcpu->arch.apic->lapic_timer.timer_advance_ns &&
1862 	    lapic_timer_int_injected(vcpu))
1863 		__kvm_wait_lapic_expire(vcpu);
1864 }
1865 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
1866 
1867 static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
1868 {
1869 	struct kvm_timer *ktimer = &apic->lapic_timer;
1870 
1871 	kvm_apic_local_deliver(apic, APIC_LVTT);
1872 	if (apic_lvtt_tscdeadline(apic)) {
1873 		ktimer->tscdeadline = 0;
1874 	} else if (apic_lvtt_oneshot(apic)) {
1875 		ktimer->tscdeadline = 0;
1876 		ktimer->target_expiration = 0;
1877 	}
1878 }
1879 
1880 static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
1881 {
1882 	struct kvm_vcpu *vcpu = apic->vcpu;
1883 	struct kvm_timer *ktimer = &apic->lapic_timer;
1884 
1885 	if (atomic_read(&apic->lapic_timer.pending))
1886 		return;
1887 
1888 	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
1889 		ktimer->expired_tscdeadline = ktimer->tscdeadline;
1890 
1891 	if (!from_timer_fn && apic->apicv_active) {
1892 		WARN_ON(kvm_get_running_vcpu() != vcpu);
1893 		kvm_apic_inject_pending_timer_irqs(apic);
1894 		return;
1895 	}
1896 
1897 	if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
1898 		/*
1899 		 * Ensure the guest's timer has truly expired before posting an
1900 		 * interrupt.  Open code the relevant checks to avoid querying
1901 		 * lapic_timer_int_injected(), which will be false since the
1902 		 * interrupt isn't yet injected.  Waiting until after injecting
1903 		 * is not an option since that won't help a posted interrupt.
1904 		 */
1905 		if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
1906 		    vcpu->arch.apic->lapic_timer.timer_advance_ns)
1907 			__kvm_wait_lapic_expire(vcpu);
1908 		kvm_apic_inject_pending_timer_irqs(apic);
1909 		return;
1910 	}
1911 
1912 	atomic_inc(&apic->lapic_timer.pending);
1913 	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1914 	if (from_timer_fn)
1915 		kvm_vcpu_kick(vcpu);
1916 }
1917 
1918 static void start_sw_tscdeadline(struct kvm_lapic *apic)
1919 {
1920 	struct kvm_timer *ktimer = &apic->lapic_timer;
1921 	u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
1922 	u64 ns = 0;
1923 	ktime_t expire;
1924 	struct kvm_vcpu *vcpu = apic->vcpu;
1925 	u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
1926 	unsigned long flags;
1927 	ktime_t now;
1928 
1929 	if (unlikely(!tscdeadline || !this_tsc_khz))
1930 		return;
1931 
1932 	local_irq_save(flags);
1933 
1934 	now = ktime_get();
1935 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1936 
1937 	ns = (tscdeadline - guest_tsc) * 1000000ULL;
1938 	do_div(ns, this_tsc_khz);
1939 
1940 	if (likely(tscdeadline > guest_tsc) &&
1941 	    likely(ns > apic->lapic_timer.timer_advance_ns)) {
1942 		expire = ktime_add_ns(now, ns);
1943 		expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
1944 		hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
1945 	} else
1946 		apic_timer_expired(apic, false);
1947 
1948 	local_irq_restore(flags);
1949 }
1950 
1951 static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
1952 {
1953 	return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
1954 		(u64)apic->divide_count;
1955 }
1956 
1957 static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
1958 {
1959 	ktime_t now, remaining;
1960 	u64 ns_remaining_old, ns_remaining_new;
1961 
1962 	apic->lapic_timer.period =
1963 			tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
1964 	limit_periodic_timer_frequency(apic);
1965 
1966 	now = ktime_get();
1967 	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
1968 	if (ktime_to_ns(remaining) < 0)
1969 		remaining = 0;
1970 
1971 	ns_remaining_old = ktime_to_ns(remaining);
1972 	ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
1973 	                                   apic->divide_count, old_divisor);
1974 
1975 	apic->lapic_timer.tscdeadline +=
1976 		nsec_to_cycles(apic->vcpu, ns_remaining_new) -
1977 		nsec_to_cycles(apic->vcpu, ns_remaining_old);
1978 	apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
1979 }
1980 
1981 static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
1982 {
1983 	ktime_t now;
1984 	u64 tscl = rdtsc();
1985 	s64 deadline;
1986 
1987 	now = ktime_get();
1988 	apic->lapic_timer.period =
1989 			tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
1990 
1991 	if (!apic->lapic_timer.period) {
1992 		apic->lapic_timer.tscdeadline = 0;
1993 		return false;
1994 	}
1995 
1996 	limit_periodic_timer_frequency(apic);
1997 	deadline = apic->lapic_timer.period;
1998 
1999 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
2000 		if (unlikely(count_reg != APIC_TMICT)) {
2001 			deadline = tmict_to_ns(apic,
2002 				     kvm_lapic_get_reg(apic, count_reg));
2003 			if (unlikely(deadline <= 0)) {
2004 				if (apic_lvtt_period(apic))
2005 					deadline = apic->lapic_timer.period;
2006 				else
2007 					deadline = 0;
2008 			}
2009 			else if (unlikely(deadline > apic->lapic_timer.period)) {
2010 				pr_info_ratelimited(
2011 				    "vcpu %i: requested lapic timer restore with "
2012 				    "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
2013 				    "Using initial count to start timer.\n",
2014 				    apic->vcpu->vcpu_id,
2015 				    count_reg,
2016 				    kvm_lapic_get_reg(apic, count_reg),
2017 				    deadline, apic->lapic_timer.period);
2018 				kvm_lapic_set_reg(apic, count_reg, 0);
2019 				deadline = apic->lapic_timer.period;
2020 			}
2021 		}
2022 	}
2023 
2024 	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
2025 		nsec_to_cycles(apic->vcpu, deadline);
2026 	apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
2027 
2028 	return true;
2029 }
2030 
2031 static void advance_periodic_target_expiration(struct kvm_lapic *apic)
2032 {
2033 	ktime_t now = ktime_get();
2034 	u64 tscl = rdtsc();
2035 	ktime_t delta;
2036 
2037 	/*
2038 	 * Synchronize both deadlines to the same time source or
2039 	 * differences in the periods (caused by differences in the
2040 	 * underlying clocks or numerical approximation errors) will
2041 	 * cause the two to drift apart over time as the errors
2042 	 * accumulate.
2043 	 */
2044 	apic->lapic_timer.target_expiration =
2045 		ktime_add_ns(apic->lapic_timer.target_expiration,
2046 				apic->lapic_timer.period);
2047 	delta = ktime_sub(apic->lapic_timer.target_expiration, now);
2048 	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
2049 		nsec_to_cycles(apic->vcpu, delta);
2050 }
2051 
2052 static void start_sw_period(struct kvm_lapic *apic)
2053 {
2054 	if (!apic->lapic_timer.period)
2055 		return;
2056 
2057 	if (ktime_after(ktime_get(),
2058 			apic->lapic_timer.target_expiration)) {
2059 		apic_timer_expired(apic, false);
2060 
2061 		if (apic_lvtt_oneshot(apic))
2062 			return;
2063 
2064 		advance_periodic_target_expiration(apic);
2065 	}
2066 
2067 	hrtimer_start(&apic->lapic_timer.timer,
2068 		apic->lapic_timer.target_expiration,
2069 		HRTIMER_MODE_ABS_HARD);
2070 }
2071 
2072 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
2073 {
2074 	if (!lapic_in_kernel(vcpu))
2075 		return false;
2076 
2077 	return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
2078 }
2079 
2080 static void cancel_hv_timer(struct kvm_lapic *apic)
2081 {
2082 	WARN_ON(preemptible());
2083 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
2084 	kvm_x86_call(cancel_hv_timer)(apic->vcpu);
2085 	apic->lapic_timer.hv_timer_in_use = false;
2086 }
2087 
2088 static bool start_hv_timer(struct kvm_lapic *apic)
2089 {
2090 	struct kvm_timer *ktimer = &apic->lapic_timer;
2091 	struct kvm_vcpu *vcpu = apic->vcpu;
2092 	bool expired;
2093 
2094 	WARN_ON(preemptible());
2095 	if (!kvm_can_use_hv_timer(vcpu))
2096 		return false;
2097 
2098 	if (!ktimer->tscdeadline)
2099 		return false;
2100 
2101 	if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
2102 		return false;
2103 
2104 	ktimer->hv_timer_in_use = true;
2105 	hrtimer_cancel(&ktimer->timer);
2106 
2107 	/*
2108 	 * To simplify handling the periodic timer, leave the hv timer running
2109 	 * even if the deadline timer has expired, i.e. rely on the resulting
2110 	 * VM-Exit to recompute the periodic timer's target expiration.
2111 	 */
2112 	if (!apic_lvtt_period(apic)) {
2113 		/*
2114 		 * Cancel the hv timer if the sw timer fired while the hv timer
2115 		 * was being programmed, or if the hv timer itself expired.
2116 		 */
2117 		if (atomic_read(&ktimer->pending)) {
2118 			cancel_hv_timer(apic);
2119 		} else if (expired) {
2120 			apic_timer_expired(apic, false);
2121 			cancel_hv_timer(apic);
2122 		}
2123 	}
2124 
2125 	trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
2126 
2127 	return true;
2128 }
2129 
2130 static void start_sw_timer(struct kvm_lapic *apic)
2131 {
2132 	struct kvm_timer *ktimer = &apic->lapic_timer;
2133 
2134 	WARN_ON(preemptible());
2135 	if (apic->lapic_timer.hv_timer_in_use)
2136 		cancel_hv_timer(apic);
2137 	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
2138 		return;
2139 
2140 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
2141 		start_sw_period(apic);
2142 	else if (apic_lvtt_tscdeadline(apic))
2143 		start_sw_tscdeadline(apic);
2144 	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
2145 }
2146 
2147 static void restart_apic_timer(struct kvm_lapic *apic)
2148 {
2149 	preempt_disable();
2150 
2151 	if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
2152 		goto out;
2153 
2154 	if (!start_hv_timer(apic))
2155 		start_sw_timer(apic);
2156 out:
2157 	preempt_enable();
2158 }
2159 
2160 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
2161 {
2162 	struct kvm_lapic *apic = vcpu->arch.apic;
2163 
2164 	preempt_disable();
2165 	/* If the preempt notifier has already run, it also called apic_timer_expired */
2166 	if (!apic->lapic_timer.hv_timer_in_use)
2167 		goto out;
2168 	WARN_ON(kvm_vcpu_is_blocking(vcpu));
2169 	apic_timer_expired(apic, false);
2170 	cancel_hv_timer(apic);
2171 
2172 	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
2173 		advance_periodic_target_expiration(apic);
2174 		restart_apic_timer(apic);
2175 	}
2176 out:
2177 	preempt_enable();
2178 }
2179 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
2180 
2181 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
2182 {
2183 	restart_apic_timer(vcpu->arch.apic);
2184 }
2185 
2186 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
2187 {
2188 	struct kvm_lapic *apic = vcpu->arch.apic;
2189 
2190 	preempt_disable();
2191 	/* Possibly the TSC deadline timer is not enabled yet */
2192 	if (apic->lapic_timer.hv_timer_in_use)
2193 		start_sw_timer(apic);
2194 	preempt_enable();
2195 }
2196 
2197 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
2198 {
2199 	struct kvm_lapic *apic = vcpu->arch.apic;
2200 
2201 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
2202 	restart_apic_timer(apic);
2203 }
2204 
2205 static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
2206 {
2207 	atomic_set(&apic->lapic_timer.pending, 0);
2208 
2209 	if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
2210 	    && !set_target_expiration(apic, count_reg))
2211 		return;
2212 
2213 	restart_apic_timer(apic);
2214 }
2215 
2216 static void start_apic_timer(struct kvm_lapic *apic)
2217 {
2218 	__start_apic_timer(apic, APIC_TMICT);
2219 }
2220 
2221 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
2222 {
2223 	bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
2224 
2225 	if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
2226 		apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
2227 		if (lvt0_in_nmi_mode) {
2228 			atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
2229 		} else
2230 			atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
2231 	}
2232 }
2233 
2234 static int get_lvt_index(u32 reg)
2235 {
2236 	if (reg == APIC_LVTCMCI)
2237 		return LVT_CMCI;
2238 	if (reg < APIC_LVTT || reg > APIC_LVTERR)
2239 		return -1;
2240 	return array_index_nospec(
2241 			(reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
2242 }
2243 
/*
 * Emulate a 32-bit write of @val to the local APIC register at offset @reg.
 *
 * Returns 0 if the write was handled and 1 if it was rejected, i.e. the
 * register is not writable this way in the current APIC mode, reserved bits
 * are set, the LVT entry is not supported, or the offset is not handled.
 * The APIC maps are recalculated (if dirty) before returning in all cases.
 */
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
	int ret = 0;

	trace_kvm_apic_write(reg, val);

	switch (reg) {
	case APIC_ID:		/* Local APIC ID */
		/* Only writable outside x2APIC mode; the ID is in bits 31:24. */
		if (!apic_x2apic_mode(apic)) {
			kvm_apic_set_xapic_id(apic, val >> 24);
		} else {
			ret = 1;
		}
		break;

	case APIC_TASKPRI:
		/* Only the low byte of the written value is used. */
		report_tpr_access(apic, true);
		apic_set_tpr(apic, val & 0xff);
		break;

	case APIC_EOI:
		/* The written value is ignored. */
		apic_set_eoi(apic);
		break;

	case APIC_LDR:
		/* Rejected in x2APIC mode. */
		if (!apic_x2apic_mode(apic))
			kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
		else
			ret = 1;
		break;

	case APIC_DFR:
		/* Rejected in x2APIC mode; the low 28 bits are forced to 1. */
		if (!apic_x2apic_mode(apic))
			kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
		else
			ret = 1;
		break;

	case APIC_SPIV: {
		u32 mask = 0x3ff;
		/* The directed-EOI bit is writable only if LVR advertises it. */
		if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
			mask |= APIC_SPIV_DIRECTED_EOI;
		apic_set_spiv(apic, val & mask);
		/*
		 * Clearing the software enable bit masks every LVT entry and
		 * drops any pending timer expiry.
		 */
		if (!(val & APIC_SPIV_APIC_ENABLED)) {
			int i;

			for (i = 0; i < apic->nr_lvt_entries; i++) {
				kvm_lapic_set_reg(apic, APIC_LVTx(i),
					kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
			}
			apic_update_lvtt(apic);
			atomic_set(&apic->lapic_timer.pending, 0);

		}
		break;
	}
	case APIC_ICR:
		WARN_ON_ONCE(apic_x2apic_mode(apic));

		/* No delay here, so we always clear the pending bit */
		val &= ~APIC_ICR_BUSY;
		/* The destination (high half) comes from the current ICR2. */
		kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
		kvm_lapic_set_reg(apic, APIC_ICR, val);
		break;
	case APIC_ICR2:
		/* Rejected in x2APIC mode; only bits 31:24 are kept. */
		if (apic_x2apic_mode(apic))
			ret = 1;
		else
			kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
		break;

	case APIC_LVT0:
		/* LVT0 additionally feeds the NMI-watchdog bookkeeping. */
		apic_manage_nmi_watchdog(apic, val);
		fallthrough;
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_LVTCMCI: {
		u32 index = get_lvt_index(reg);
		if (!kvm_lapic_lvt_supported(apic, index)) {
			ret = 1;
			break;
		}
		/* While the APIC is software-disabled, LVT entries stay masked. */
		if (!kvm_apic_sw_enabled(apic))
			val |= APIC_LVT_MASKED;
		val &= apic_lvt_mask[index];
		kvm_lapic_set_reg(apic, reg, val);
		break;
	}

	case APIC_LVTT:
		/* Same masking rule as above, plus the timer-mode bits. */
		if (!kvm_apic_sw_enabled(apic))
			val |= APIC_LVT_MASKED;
		val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
		kvm_lapic_set_reg(apic, APIC_LVTT, val);
		apic_update_lvtt(apic);
		break;

	case APIC_TMICT:
		/* Initial-count writes are ignored in TSC-deadline mode. */
		if (apic_lvtt_tscdeadline(apic))
			break;

		cancel_apic_timer(apic);
		kvm_lapic_set_reg(apic, APIC_TMICT, val);
		start_apic_timer(apic);
		break;

	case APIC_TDCR: {
		uint32_t old_divisor = apic->divide_count;

		/* Only bits 0, 1 and 3 of the divide configuration are kept. */
		kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
		update_divide_count(apic);
		/*
		 * If the divisor changed while a period is configured, rescale
		 * the target expiration and re-arm the timer.
		 */
		if (apic->divide_count != old_divisor &&
				apic->lapic_timer.period) {
			hrtimer_cancel(&apic->lapic_timer.timer);
			update_target_expiration(apic, old_divisor);
			restart_apic_timer(apic);
		}
		break;
	}
	case APIC_ESR:
		/* Nothing is stored; x2APIC mode only accepts a zero write. */
		if (apic_x2apic_mode(apic) && val != 0)
			ret = 1;
		break;

	case APIC_SELF_IPI:
		/*
		 * Self-IPI exists only when x2APIC is enabled.  Bits 7:0 hold
		 * the vector, everything else is reserved.
		 */
		if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
			ret = 1;
		else
			kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
		break;
	default:
		ret = 1;
		break;
	}

	/*
	 * Recalculate APIC maps if necessary, e.g. if the software enable bit
	 * was toggled, the APIC ID changed, etc...   The maps are marked dirty
	 * on relevant changes, i.e. this is a nop for most writes.
	 */
	kvm_recalculate_apic_map(apic->vcpu->kvm);

	return ret;
}
2394 
2395 static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
2396 			    gpa_t address, int len, const void *data)
2397 {
2398 	struct kvm_lapic *apic = to_lapic(this);
2399 	unsigned int offset = address - apic->base_address;
2400 	u32 val;
2401 
2402 	if (!apic_mmio_in_range(apic, address))
2403 		return -EOPNOTSUPP;
2404 
2405 	if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
2406 		if (!kvm_check_has_quirk(vcpu->kvm,
2407 					 KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
2408 			return -EOPNOTSUPP;
2409 
2410 		return 0;
2411 	}
2412 
2413 	/*
2414 	 * APIC register must be aligned on 128-bits boundary.
2415 	 * 32/64/128 bits registers must be accessed thru 32 bits.
2416 	 * Refer SDM 8.4.1
2417 	 */
2418 	if (len != 4 || (offset & 0xf))
2419 		return 0;
2420 
2421 	val = *(u32*)data;
2422 
2423 	kvm_lapic_reg_write(apic, offset & 0xff0, val);
2424 
2425 	return 0;
2426 }
2427 
2428 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
2429 {
2430 	kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
2431 }
2432 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
2433 
2434 #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
2435 
2436 int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
2437 {
2438 	if (data & X2APIC_ICR_RESERVED_BITS)
2439 		return 1;
2440 
2441 	/*
2442 	 * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
2443 	 * only AMD requires it to be zero, Intel essentially just ignores the
2444 	 * bit.  And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
2445 	 * the CPU performs the reserved bits checks, i.e. the underlying CPU
2446 	 * behavior will "win".  Arbitrarily clear the BUSY bit, as there is no
2447 	 * sane way to provide consistent behavior with respect to hardware.
2448 	 */
2449 	data &= ~APIC_ICR_BUSY;
2450 
2451 	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
2452 	if (kvm_x86_ops.x2apic_icr_is_split) {
2453 		kvm_lapic_set_reg(apic, APIC_ICR, data);
2454 		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
2455 	} else {
2456 		kvm_lapic_set_reg64(apic, APIC_ICR, data);
2457 	}
2458 	trace_kvm_apic_write(APIC_ICR, data);
2459 	return 0;
2460 }
2461 
2462 static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
2463 {
2464 	if (kvm_x86_ops.x2apic_icr_is_split)
2465 		return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
2466 		       (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
2467 
2468 	return kvm_lapic_get_reg64(apic, APIC_ICR);
2469 }
2470 
2471 /* emulate APIC access in a trap manner */
2472 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
2473 {
2474 	struct kvm_lapic *apic = vcpu->arch.apic;
2475 
2476 	/*
2477 	 * ICR is a single 64-bit register when x2APIC is enabled, all others
2478 	 * registers hold 32-bit values.  For legacy xAPIC, ICR writes need to
2479 	 * go down the common path to get the upper half from ICR2.
2480 	 *
2481 	 * Note, using the write helpers may incur an unnecessary write to the
2482 	 * virtual APIC state, but KVM needs to conditionally modify the value
2483 	 * in certain cases, e.g. to clear the ICR busy bit.  The cost of extra
2484 	 * conditional branches is likely a wash relative to the cost of the
2485 	 * maybe-unecessary write, and both are in the noise anyways.
2486 	 */
2487 	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
2488 		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
2489 	else
2490 		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
2491 }
2492 EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
2493 
2494 void kvm_free_lapic(struct kvm_vcpu *vcpu)
2495 {
2496 	struct kvm_lapic *apic = vcpu->arch.apic;
2497 
2498 	if (!vcpu->arch.apic) {
2499 		static_branch_dec(&kvm_has_noapic_vcpu);
2500 		return;
2501 	}
2502 
2503 	hrtimer_cancel(&apic->lapic_timer.timer);
2504 
2505 	if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
2506 		static_branch_slow_dec_deferred(&apic_hw_disabled);
2507 
2508 	if (!apic->sw_enabled)
2509 		static_branch_slow_dec_deferred(&apic_sw_disabled);
2510 
2511 	if (apic->regs)
2512 		free_page((unsigned long)apic->regs);
2513 
2514 	kfree(apic);
2515 }
2516 
2517 /*
2518  *----------------------------------------------------------------------
2519  * LAPIC interface
2520  *----------------------------------------------------------------------
2521  */
2522 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
2523 {
2524 	struct kvm_lapic *apic = vcpu->arch.apic;
2525 
2526 	if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
2527 		return 0;
2528 
2529 	return apic->lapic_timer.tscdeadline;
2530 }
2531 
2532 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
2533 {
2534 	struct kvm_lapic *apic = vcpu->arch.apic;
2535 
2536 	if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
2537 		return;
2538 
2539 	hrtimer_cancel(&apic->lapic_timer.timer);
2540 	apic->lapic_timer.tscdeadline = data;
2541 	start_apic_timer(apic);
2542 }
2543 
2544 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
2545 {
2546 	apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
2547 }
2548 
2549 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
2550 {
2551 	u64 tpr;
2552 
2553 	tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
2554 
2555 	return (tpr & 0xf0) >> 4;
2556 }
2557 
2558 static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
2559 {
2560 	u64 old_value = vcpu->arch.apic_base;
2561 	struct kvm_lapic *apic = vcpu->arch.apic;
2562 
2563 	vcpu->arch.apic_base = value;
2564 
2565 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
2566 		vcpu->arch.cpuid_dynamic_bits_dirty = true;
2567 
2568 	if (!apic)
2569 		return;
2570 
2571 	/* update jump label if enable bit changes */
2572 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
2573 		if (value & MSR_IA32_APICBASE_ENABLE) {
2574 			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
2575 			static_branch_slow_dec_deferred(&apic_hw_disabled);
2576 			/* Check if there are APF page ready requests pending */
2577 			kvm_make_request(KVM_REQ_APF_READY, vcpu);
2578 		} else {
2579 			static_branch_inc(&apic_hw_disabled.key);
2580 			atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
2581 		}
2582 	}
2583 
2584 	if ((old_value ^ value) & X2APIC_ENABLE) {
2585 		if (value & X2APIC_ENABLE)
2586 			kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
2587 		else if (value & MSR_IA32_APICBASE_ENABLE)
2588 			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
2589 	}
2590 
2591 	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
2592 		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
2593 		kvm_x86_call(set_virtual_apic_mode)(vcpu);
2594 	}
2595 
2596 	apic->base_address = apic->vcpu->arch.apic_base &
2597 			     MSR_IA32_APICBASE_BASE;
2598 
2599 	if ((value & MSR_IA32_APICBASE_ENABLE) &&
2600 	     apic->base_address != APIC_DEFAULT_PHYS_BASE) {
2601 		kvm_set_apicv_inhibit(apic->vcpu->kvm,
2602 				      APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
2603 	}
2604 }
2605 
2606 int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
2607 {
2608 	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
2609 	enum lapic_mode new_mode = kvm_apic_mode(value);
2610 
2611 	if (vcpu->arch.apic_base == value)
2612 		return 0;
2613 
2614 	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
2615 		(guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
2616 
2617 	if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
2618 		return 1;
2619 	if (!host_initiated) {
2620 		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
2621 			return 1;
2622 		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
2623 			return 1;
2624 	}
2625 
2626 	__kvm_apic_set_base(vcpu, value);
2627 	kvm_recalculate_apic_map(vcpu->kvm);
2628 	return 0;
2629 }
2630 EXPORT_SYMBOL_GPL(kvm_apic_set_base);
2631 
2632 void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
2633 {
2634 	struct kvm_lapic *apic = vcpu->arch.apic;
2635 
2636 	/*
2637 	 * When APICv is enabled, KVM must always search the IRR for a pending
2638 	 * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
2639 	 * isn't running.  If APICv is disabled, KVM _should_ search the IRR
2640 	 * for a pending IRQ.  But KVM currently doesn't ensure *all* hardware,
2641 	 * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
2642 	 * the IRR at this time could race with IRQ delivery from hardware that
2643 	 * still sees APICv as being enabled.
2644 	 *
2645 	 * FIXME: Ensure other vCPUs and devices observe the change in APICv
2646 	 *        state prior to updating KVM's metadata caches, so that KVM
2647 	 *        can safely search the IRR and set irr_pending accordingly.
2648 	 */
2649 	apic->irr_pending = true;
2650 
2651 	if (apic->apicv_active)
2652 		apic->isr_count = 1;
2653 	else
2654 		apic->isr_count = count_vectors(apic->regs + APIC_ISR);
2655 
2656 	apic->highest_isr_cache = -1;
2657 }
2658 
2659 int kvm_alloc_apic_access_page(struct kvm *kvm)
2660 {
2661 	void __user *hva;
2662 	int ret = 0;
2663 
2664 	mutex_lock(&kvm->slots_lock);
2665 	if (kvm->arch.apic_access_memslot_enabled ||
2666 	    kvm->arch.apic_access_memslot_inhibited)
2667 		goto out;
2668 
2669 	hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
2670 				      APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
2671 	if (IS_ERR(hva)) {
2672 		ret = PTR_ERR(hva);
2673 		goto out;
2674 	}
2675 
2676 	kvm->arch.apic_access_memslot_enabled = true;
2677 out:
2678 	mutex_unlock(&kvm->slots_lock);
2679 	return ret;
2680 }
2681 EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
2682 
/*
 * Delete the APIC access page memslot, if one is enabled, and mark it as
 * inhibited so that it is not allocated again.  The vCPU's SRCU read lock
 * is dropped for the duration of the memslot update and re-acquired before
 * returning.
 */
void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;

	/* Unlocked fast path; the flag is rechecked under slots_lock below. */
	if (!kvm->arch.apic_access_memslot_enabled)
		return;

	kvm_vcpu_srcu_read_unlock(vcpu);

	mutex_lock(&kvm->slots_lock);

	if (kvm->arch.apic_access_memslot_enabled) {
		/* A zero-sized region deletes the memslot. */
		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
		/*
		 * Clear "enabled" after the memslot is deleted so that a
		 * different vCPU doesn't get a false negative when checking
		 * the flag out of slots_lock.  No additional memory barrier is
		 * needed as modifying memslots requires waiting for other
		 * vCPUs to drop SRCU (see above), and false positives are ok
		 * as the flag is rechecked after acquiring slots_lock.
		 */
		kvm->arch.apic_access_memslot_enabled = false;

		/*
		 * Mark the memslot as inhibited to prevent reallocating the
		 * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
		 */
		kvm->arch.apic_access_memslot_inhibited = true;
	}

	mutex_unlock(&kvm->slots_lock);

	kvm_vcpu_srcu_read_lock(vcpu);
}
2717 
/*
 * Reset the vCPU's local APIC state.
 *
 * When @init_event is false, the APIC base MSR is additionally reprogrammed
 * to the default base with the enable bit set (plus the BSP flag for the
 * reset BSP) and the xAPIC ID is re-derived from the vCPU ID; both steps
 * are skipped when @init_event is true.  The remaining register and timer
 * state is reset in either case, provided an in-kernel APIC exists.
 */
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	u64 msr_val;
	int i;

	kvm_x86_call(apicv_pre_state_restore)(vcpu);

	if (!init_event) {
		msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(vcpu))
			msr_val |= MSR_IA32_APICBASE_BSP;

		/*
		 * Use the inner helper to avoid an extra recalculation of the
		 * optimized APIC map if some other task has dirtied the map.
		 * The recalculation needed for this vCPU will be done after
		 * all APIC state has been initialized (see below).
		 */
		__kvm_apic_set_base(vcpu, msr_val);
	}

	/* Without an in-kernel APIC only the base MSR value is tracked. */
	if (!apic)
		return;

	/* Stop the timer in case it's a reset to an active apic */
	hrtimer_cancel(&apic->lapic_timer.timer);

	/* The xAPIC ID is set at RESET even if the APIC was already enabled. */
	if (!init_event)
		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
	kvm_apic_set_version(apic->vcpu);

	/* Every LVT entry starts out masked. */
	for (i = 0; i < apic->nr_lvt_entries; i++)
		kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
	apic_update_lvtt(apic);
	/*
	 * With the LINT0_REENABLED quirk, the reset BSP gets LVT0 programmed
	 * to unmasked ExtINT delivery instead.
	 */
	if (kvm_vcpu_is_reset_bsp(vcpu) &&
	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
		kvm_lapic_set_reg(apic, APIC_LVT0,
			     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));

	kvm_apic_set_dfr(apic, 0xffffffffU);
	apic_set_spiv(apic, 0xff);
	kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
	/* LDR is only written outside x2APIC mode. */
	if (!apic_x2apic_mode(apic))
		kvm_apic_set_ldr(apic, 0);
	kvm_lapic_set_reg(apic, APIC_ESR, 0);
	/* ICR is cleared as two 32-bit halves, or as one 64-bit value in x2APIC mode. */
	if (!apic_x2apic_mode(apic)) {
		kvm_lapic_set_reg(apic, APIC_ICR, 0);
		kvm_lapic_set_reg(apic, APIC_ICR2, 0);
	} else {
		kvm_lapic_set_reg64(apic, APIC_ICR, 0);
	}
	kvm_lapic_set_reg(apic, APIC_TDCR, 0);
	kvm_lapic_set_reg(apic, APIC_TMICT, 0);
	/* Clear all eight 32-bit words of IRR, ISR and TMR. */
	for (i = 0; i < 8; i++) {
		kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
		kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
		kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
	}
	kvm_apic_update_apicv(vcpu);
	update_divide_count(apic);
	atomic_set(&apic->lapic_timer.pending, 0);

	vcpu->arch.pv_eoi.msr_val = 0;
	apic_update_ppr(apic);
	if (apic->apicv_active) {
		kvm_x86_call(apicv_post_state_restore)(vcpu);
		kvm_x86_call(hwapic_isr_update)(vcpu, -1);
	}

	vcpu->arch.apic_arb_prio = 0;
	vcpu->arch.apic_attention = 0;

	/* The deferred APIC map recalculation mentioned above. */
	kvm_recalculate_apic_map(vcpu->kvm);
}
2795 
2796 /*
2797  *----------------------------------------------------------------------
2798  * timer interface
2799  *----------------------------------------------------------------------
2800  */
2801 
2802 static bool lapic_is_periodic(struct kvm_lapic *apic)
2803 {
2804 	return apic_lvtt_period(apic);
2805 }
2806 
2807 int apic_has_pending_timer(struct kvm_vcpu *vcpu)
2808 {
2809 	struct kvm_lapic *apic = vcpu->arch.apic;
2810 
2811 	if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
2812 		return atomic_read(&apic->lapic_timer.pending);
2813 
2814 	return 0;
2815 }
2816 
2817 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
2818 {
2819 	u32 reg = kvm_lapic_get_reg(apic, lvt_type);
2820 	int vector, mode, trig_mode;
2821 	int r;
2822 
2823 	if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
2824 		vector = reg & APIC_VECTOR_MASK;
2825 		mode = reg & APIC_MODE_MASK;
2826 		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
2827 
2828 		r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
2829 		if (r && lvt_type == APIC_LVTPC &&
2830 		    guest_cpuid_is_intel_compatible(apic->vcpu))
2831 			kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
2832 		return r;
2833 	}
2834 	return 0;
2835 }
2836 
2837 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
2838 {
2839 	struct kvm_lapic *apic = vcpu->arch.apic;
2840 
2841 	if (apic)
2842 		kvm_apic_local_deliver(apic, APIC_LVT0);
2843 }
2844 
2845 static const struct kvm_io_device_ops apic_mmio_ops = {
2846 	.read     = apic_mmio_read,
2847 	.write    = apic_mmio_write,
2848 };
2849 
2850 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
2851 {
2852 	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
2853 	struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
2854 
2855 	apic_timer_expired(apic, true);
2856 
2857 	if (lapic_is_periodic(apic)) {
2858 		advance_periodic_target_expiration(apic);
2859 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
2860 		return HRTIMER_RESTART;
2861 	} else
2862 		return HRTIMER_NORESTART;
2863 }
2864 
2865 int kvm_create_lapic(struct kvm_vcpu *vcpu)
2866 {
2867 	struct kvm_lapic *apic;
2868 
2869 	ASSERT(vcpu != NULL);
2870 
2871 	if (!irqchip_in_kernel(vcpu->kvm)) {
2872 		static_branch_inc(&kvm_has_noapic_vcpu);
2873 		return 0;
2874 	}
2875 
2876 	apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
2877 	if (!apic)
2878 		goto nomem;
2879 
2880 	vcpu->arch.apic = apic;
2881 
2882 	if (kvm_x86_ops.alloc_apic_backing_page)
2883 		apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
2884 	else
2885 		apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
2886 	if (!apic->regs) {
2887 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
2888 		       vcpu->vcpu_id);
2889 		goto nomem_free_apic;
2890 	}
2891 	apic->vcpu = vcpu;
2892 
2893 	apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
2894 
2895 	hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
2896 		      HRTIMER_MODE_ABS_HARD);
2897 	if (lapic_timer_advance)
2898 		apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
2899 
2900 	/*
2901 	 * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
2902 	 * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
2903 	 */
2904 	vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
2905 	static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
2906 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
2907 
2908 	/*
2909 	 * Defer evaluating inhibits until the vCPU is first run, as this vCPU
2910 	 * will not get notified of any changes until this vCPU is visible to
2911 	 * other vCPUs (marked online and added to the set of vCPUs).
2912 	 *
2913 	 * Opportunistically mark APICv active as VMX in particularly is highly
2914 	 * unlikely to have inhibits.  Ignore the current per-VM APICv state so
2915 	 * that vCPU creation is guaranteed to run with a deterministic value,
2916 	 * the request will ensure the vCPU gets the correct state before VM-Entry.
2917 	 */
2918 	if (enable_apicv) {
2919 		apic->apicv_active = true;
2920 		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
2921 	}
2922 
2923 	return 0;
2924 nomem_free_apic:
2925 	kfree(apic);
2926 	vcpu->arch.apic = NULL;
2927 nomem:
2928 	return -ENOMEM;
2929 }
2930 
2931 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
2932 {
2933 	struct kvm_lapic *apic = vcpu->arch.apic;
2934 	u32 ppr;
2935 
2936 	if (!kvm_apic_present(vcpu))
2937 		return -1;
2938 
2939 	if (apic->guest_apic_protected)
2940 		return -1;
2941 
2942 	__apic_update_ppr(apic, &ppr);
2943 	return apic_has_interrupt_for_ppr(apic, ppr);
2944 }
2945 EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
2946 
2947 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
2948 {
2949 	u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
2950 
2951 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
2952 		return 1;
2953 	if ((lvt0 & APIC_LVT_MASKED) == 0 &&
2954 	    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
2955 		return 1;
2956 	return 0;
2957 }
2958 
2959 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
2960 {
2961 	struct kvm_lapic *apic = vcpu->arch.apic;
2962 
2963 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
2964 		kvm_apic_inject_pending_timer_irqs(apic);
2965 		atomic_set(&apic->lapic_timer.pending, 0);
2966 	}
2967 }
2968 
2969 void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
2970 {
2971 	struct kvm_lapic *apic = vcpu->arch.apic;
2972 	u32 ppr;
2973 
2974 	if (WARN_ON_ONCE(vector < 0 || !apic))
2975 		return;
2976 
2977 	/*
2978 	 * We get here even with APIC virtualization enabled, if doing
2979 	 * nested virtualization and L1 runs with the "acknowledge interrupt
2980 	 * on exit" mode.  Then we cannot inject the interrupt via RVI,
2981 	 * because the process would deliver it through the IDT.
2982 	 */
2983 
2984 	apic_clear_irr(vector, apic);
2985 	if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) {
2986 		/*
2987 		 * For auto-EOI interrupts, there might be another pending
2988 		 * interrupt above PPR, so check whether to raise another
2989 		 * KVM_REQ_EVENT.
2990 		 */
2991 		apic_update_ppr(apic);
2992 	} else {
2993 		/*
2994 		 * For normal interrupts, PPR has been raised and there cannot
2995 		 * be a higher-priority pending interrupt---except if there was
2996 		 * a concurrent interrupt injection, but that would have
2997 		 * triggered KVM_REQ_EVENT already.
2998 		 */
2999 		apic_set_isr(vector, apic);
3000 		__apic_update_ppr(apic, &ppr);
3001 	}
3002 
3003 }
3004 EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
3005 
3006 static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
3007 		struct kvm_lapic_state *s, bool set)
3008 {
3009 	if (apic_x2apic_mode(vcpu->arch.apic)) {
3010 		u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
3011 		u32 *id = (u32 *)(s->regs + APIC_ID);
3012 		u32 *ldr = (u32 *)(s->regs + APIC_LDR);
3013 		u64 icr;
3014 
3015 		if (vcpu->kvm->arch.x2apic_format) {
3016 			if (*id != x2apic_id)
3017 				return -EINVAL;
3018 		} else {
3019 			/*
3020 			 * Ignore the userspace value when setting APIC state.
3021 			 * KVM's model is that the x2APIC ID is readonly, e.g.
3022 			 * KVM only supports delivering interrupts to KVM's
3023 			 * version of the x2APIC ID.  However, for backwards
3024 			 * compatibility, don't reject attempts to set a
3025 			 * mismatched ID for userspace that hasn't opted into
3026 			 * x2apic_format.
3027 			 */
3028 			if (set)
3029 				*id = x2apic_id;
3030 			else
3031 				*id = x2apic_id << 24;
3032 		}
3033 
3034 		/*
3035 		 * In x2APIC mode, the LDR is fixed and based on the id.  And
3036 		 * if the ICR is _not_ split, ICR is internally a single 64-bit
3037 		 * register, but needs to be split to ICR+ICR2 in userspace for
3038 		 * backwards compatibility.
3039 		 */
3040 		if (set)
3041 			*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
3042 
3043 		if (!kvm_x86_ops.x2apic_icr_is_split) {
3044 			if (set) {
3045 				icr = apic_get_reg(s->regs, APIC_ICR) |
3046 				      (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
3047 				apic_set_reg64(s->regs, APIC_ICR, icr);
3048 			} else {
3049 				icr = apic_get_reg64(s->regs, APIC_ICR);
3050 				apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
3051 			}
3052 		}
3053 	}
3054 
3055 	return 0;
3056 }
3057 
3058 int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
3059 {
3060 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
3061 
3062 	/*
3063 	 * Get calculated timer current count for remaining timer period (if
3064 	 * any) and store it in the returned register set.
3065 	 */
3066 	apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
3067 
3068 	return kvm_apic_state_fixup(vcpu, s, false);
3069 }
3070 
3071 int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
3072 {
3073 	struct kvm_lapic *apic = vcpu->arch.apic;
3074 	int r;
3075 
3076 	kvm_x86_call(apicv_pre_state_restore)(vcpu);
3077 
3078 	/* set SPIV separately to get count of SW disabled APICs right */
3079 	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
3080 
3081 	r = kvm_apic_state_fixup(vcpu, s, true);
3082 	if (r) {
3083 		kvm_recalculate_apic_map(vcpu->kvm);
3084 		return r;
3085 	}
3086 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
3087 
3088 	atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
3089 	kvm_recalculate_apic_map(vcpu->kvm);
3090 	kvm_apic_set_version(vcpu);
3091 
3092 	apic_update_ppr(apic);
3093 	cancel_apic_timer(apic);
3094 	apic->lapic_timer.expired_tscdeadline = 0;
3095 	apic_update_lvtt(apic);
3096 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
3097 	update_divide_count(apic);
3098 	__start_apic_timer(apic, APIC_TMCCT);
3099 	kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
3100 	kvm_apic_update_apicv(vcpu);
3101 	if (apic->apicv_active) {
3102 		kvm_x86_call(apicv_post_state_restore)(vcpu);
3103 		kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
3104 	}
3105 	kvm_make_request(KVM_REQ_EVENT, vcpu);
3106 
3107 #ifdef CONFIG_KVM_IOAPIC
3108 	if (ioapic_in_kernel(vcpu->kvm))
3109 		kvm_rtc_eoi_tracking_restore_one(vcpu);
3110 #endif
3111 
3112 	vcpu->arch.apic_arb_prio = 0;
3113 
3114 	return 0;
3115 }
3116 
3117 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
3118 {
3119 	struct hrtimer *timer;
3120 
3121 	if (!lapic_in_kernel(vcpu) ||
3122 		kvm_can_post_timer_interrupt(vcpu))
3123 		return;
3124 
3125 	timer = &vcpu->arch.apic->lapic_timer.timer;
3126 	if (hrtimer_cancel(timer))
3127 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
3128 }
3129 
/*
 * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
 *
 * Detect whether the guest triggered a PV EOI since the last entry.  If it
 * did, perform the EOI on the guest's behalf.  The PV EOI flag in guest
 * memory is cleared in either case.
 *
 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in the host and
 * KVM_PV_EOI_ENABLED in guest memory as follows:
 *
 * KVM_APIC_PV_EOI_PENDING is unset:
 *	-> host disabled PV EOI.
 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
 *	-> host enabled PV EOI, guest did not execute EOI yet.
 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
 *	-> host enabled PV EOI, guest executed EOI.
 */
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
					struct kvm_lapic *apic)
{
	BUG_ON(!pv_eoi_enabled(vcpu));

	/*
	 * A zero result from the test-and-clear is handled as "the guest has
	 * executed its EOI", so the EOI is completed here and traced.
	 */
	if (!pv_eoi_test_and_clr_pending(vcpu)) {
		int vector = apic_set_eoi(apic);

		trace_kvm_pv_eoi(apic, vector);
	}
}
3159 
3160 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
3161 {
3162 	u32 data;
3163 
3164 	if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
3165 		apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
3166 
3167 	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
3168 		return;
3169 
3170 	if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
3171 				  sizeof(u32)))
3172 		return;
3173 
3174 	apic_set_tpr(vcpu->arch.apic, data & 0xff);
3175 }
3176 
3177 /*
3178  * apic_sync_pv_eoi_to_guest - called before vmentry
3179  *
3180  * Detect whether it's safe to enable PV EOI and
3181  * if yes do so.
3182  */
3183 static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
3184 					struct kvm_lapic *apic)
3185 {
3186 	if (!pv_eoi_enabled(vcpu) ||
3187 	    /* IRR set or many bits in ISR: could be nested. */
3188 	    apic->irr_pending ||
3189 	    /* Cache not set: could be safe but we don't bother. */
3190 	    apic->highest_isr_cache == -1 ||
3191 	    /* Need EOI to update ioapic. */
3192 	    kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
3193 		/*
3194 		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
3195 		 * so we need not do anything here.
3196 		 */
3197 		return;
3198 	}
3199 
3200 	pv_eoi_set_pending(apic->vcpu);
3201 }
3202 
3203 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
3204 {
3205 	u32 data, tpr;
3206 	int max_irr, max_isr;
3207 	struct kvm_lapic *apic = vcpu->arch.apic;
3208 
3209 	apic_sync_pv_eoi_to_guest(vcpu, apic);
3210 
3211 	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
3212 		return;
3213 
3214 	tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
3215 	max_irr = apic_find_highest_irr(apic);
3216 	if (max_irr < 0)
3217 		max_irr = 0;
3218 	max_isr = apic_find_highest_isr(apic);
3219 	if (max_isr < 0)
3220 		max_isr = 0;
3221 	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
3222 
3223 	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
3224 				sizeof(u32));
3225 }
3226 
3227 int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
3228 {
3229 	if (vapic_addr) {
3230 		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
3231 					&vcpu->arch.apic->vapic_cache,
3232 					vapic_addr, sizeof(u32)))
3233 			return -EINVAL;
3234 		__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
3235 	} else {
3236 		__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
3237 	}
3238 
3239 	vcpu->arch.apic->vapic_addr = vapic_addr;
3240 	return 0;
3241 }
3242 
3243 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
3244 {
3245 	u32 low;
3246 
3247 	if (reg == APIC_ICR) {
3248 		*data = kvm_x2apic_icr_read(apic);
3249 		return 0;
3250 	}
3251 
3252 	if (kvm_lapic_reg_read(apic, reg, 4, &low))
3253 		return 1;
3254 
3255 	*data = low;
3256 
3257 	return 0;
3258 }
3259 
3260 static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
3261 {
3262 	/*
3263 	 * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
3264 	 * can be written as such, all other registers remain accessible only
3265 	 * through 32-bit reads/writes.
3266 	 */
3267 	if (reg == APIC_ICR)
3268 		return kvm_x2apic_icr_write(apic, data);
3269 
3270 	/* Bits 63:32 are reserved in all other registers. */
3271 	if (data >> 32)
3272 		return 1;
3273 
3274 	return kvm_lapic_reg_write(apic, reg, (u32)data);
3275 }
3276 
3277 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
3278 {
3279 	struct kvm_lapic *apic = vcpu->arch.apic;
3280 	u32 reg = (msr - APIC_BASE_MSR) << 4;
3281 
3282 	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
3283 		return 1;
3284 
3285 	return kvm_lapic_msr_write(apic, reg, data);
3286 }
3287 
3288 int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
3289 {
3290 	struct kvm_lapic *apic = vcpu->arch.apic;
3291 	u32 reg = (msr - APIC_BASE_MSR) << 4;
3292 
3293 	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
3294 		return 1;
3295 
3296 	return kvm_lapic_msr_read(apic, reg, data);
3297 }
3298 
3299 int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
3300 {
3301 	if (!lapic_in_kernel(vcpu))
3302 		return 1;
3303 
3304 	return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
3305 }
3306 
3307 int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
3308 {
3309 	if (!lapic_in_kernel(vcpu))
3310 		return 1;
3311 
3312 	return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
3313 }
3314 
3315 int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
3316 {
3317 	u64 addr = data & ~KVM_MSR_ENABLED;
3318 	struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
3319 	unsigned long new_len;
3320 	int ret;
3321 
3322 	if (!IS_ALIGNED(addr, 4))
3323 		return 1;
3324 
3325 	if (data & KVM_MSR_ENABLED) {
3326 		if (addr == ghc->gpa && len <= ghc->len)
3327 			new_len = ghc->len;
3328 		else
3329 			new_len = len;
3330 
3331 		ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
3332 		if (ret)
3333 			return ret;
3334 	}
3335 
3336 	vcpu->arch.pv_eoi.msr_val = data;
3337 
3338 	return 0;
3339 }
3340 
3341 int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
3342 {
3343 	struct kvm_lapic *apic = vcpu->arch.apic;
3344 	u8 sipi_vector;
3345 	int r;
3346 
3347 	if (!kvm_apic_has_pending_init_or_sipi(vcpu))
3348 		return 0;
3349 
3350 	if (is_guest_mode(vcpu)) {
3351 		r = kvm_check_nested_events(vcpu);
3352 		if (r < 0)
3353 			return r == -EBUSY ? 0 : r;
3354 		/*
3355 		 * Continue processing INIT/SIPI even if a nested VM-Exit
3356 		 * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
3357 		 * are blocked as a result of transitioning to VMX root mode.
3358 		 */
3359 	}
3360 
3361 	/*
3362 	 * INITs are blocked while CPU is in specific states (SMM, VMX root
3363 	 * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
3364 	 * wait-for-SIPI (WFS).
3365 	 */
3366 	if (!kvm_apic_init_sipi_allowed(vcpu)) {
3367 		WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
3368 		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3369 		return 0;
3370 	}
3371 
3372 	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
3373 		kvm_vcpu_reset(vcpu, true);
3374 		if (kvm_vcpu_is_bsp(apic->vcpu))
3375 			kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3376 		else
3377 			kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
3378 	}
3379 	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3380 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
3381 			/* evaluate pending_events before reading the vector */
3382 			smp_rmb();
3383 			sipi_vector = apic->sipi_vector;
3384 			kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
3385 							       sipi_vector);
3386 			kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3387 		}
3388 	}
3389 	return 0;
3390 }
3391 
3392 void kvm_lapic_exit(void)
3393 {
3394 	static_key_deferred_flush(&apic_hw_disabled);
3395 	WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
3396 	static_key_deferred_flush(&apic_sw_disabled);
3397 	WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
3398 }
3399