xref: /linux/arch/x86/kernel/kvm.c (revision d639d9fa162aadec1ae9980c4dcf6e50bd2f8290)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * KVM paravirt_ops implementation
4  *
5  * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6  * Copyright IBM Corporation, 2007
7  *   Authors: Anthony Liguori <aliguori@us.ibm.com>
8  */
9 
10 #define pr_fmt(fmt) "kvm-guest: " fmt
11 
12 #include <linux/context_tracking.h>
13 #include <linux/init.h>
14 #include <linux/irq.h>
15 #include <linux/kernel.h>
16 #include <linux/kvm_para.h>
17 #include <linux/cpu.h>
18 #include <linux/mm.h>
19 #include <linux/highmem.h>
20 #include <linux/hardirq.h>
21 #include <linux/notifier.h>
22 #include <linux/reboot.h>
23 #include <linux/hash.h>
24 #include <linux/sched.h>
25 #include <linux/slab.h>
26 #include <linux/kprobes.h>
27 #include <linux/nmi.h>
28 #include <linux/swait.h>
29 #include <linux/syscore_ops.h>
30 #include <linux/cc_platform.h>
31 #include <linux/efi.h>
32 #include <linux/kvm_types.h>
33 #include <linux/sched/cputime.h>
34 #include <asm/timer.h>
35 #include <asm/cpu.h>
36 #include <asm/traps.h>
37 #include <asm/desc.h>
38 #include <asm/tlbflush.h>
39 #include <asm/apic.h>
40 #include <asm/apicdef.h>
41 #include <asm/hypervisor.h>
42 #include <asm/mtrr.h>
43 #include <asm/tlb.h>
44 #include <asm/cpuid/api.h>
45 #include <asm/cpuidle_haltpoll.h>
46 #include <asm/msr.h>
47 #include <asm/ptrace.h>
48 #include <asm/reboot.h>
49 #include <asm/svm.h>
50 #include <asm/e820/api.h>
51 
/*
 * Static key for async page fault support. It is switched on in
 * kvm_guest_init() when KVM_FEATURE_ASYNC_PF_INT is available and async
 * page faults have not been disabled via "no-kvmapf".
 */
DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);

/* Non-zero while async page faults may be used; cleared by "no-kvmapf". */
static int kvmapf = 1;

/* Early parameter handler for "no-kvmapf": turn async page faults off. */
static int __init parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);
63 
/*
 * Non-zero while steal time accounting for the runqueue is wanted; consulted
 * by activate_jump_labels(). Cleared by the "no-steal-acc" early parameter.
 */
static int steal_acc = 1;
/* Early parameter handler for "no-steal-acc". */
static int __init parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);
72 
/* Per-CPU flag: set once async #PF has been enabled on this CPU via MSR. */
static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
/* Per-CPU async #PF area; its physical address is written to MSR_KVM_ASYNC_PF_EN. */
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
/* Per-CPU steal time area; its physical address is written to MSR_KVM_STEAL_TIME. */
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
/* Set in kvm_guest_init() when KVM_FEATURE_STEAL_TIME is available. */
static int has_steal_clock = 0;

/* Poll-control state recorded by kvm_suspend() and consumed by kvm_resume(). */
static int has_guest_poll = 0;
79 
/* The async #PF sleeper table has 2^KVM_TASK_SLEEP_HASHBITS buckets. */
#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

/* One entry in the async #PF sleeper hash table. */
struct kvm_task_sleep_node {
	struct hlist_node link;		/* linkage in the hash bucket list */
	struct swait_queue_head wq;	/* wait queue the faulting task sleeps on */
	u32 token;			/* async #PF token this entry belongs to */
	int cpu;			/* CPU that queued the entry */
	bool dummy;			/* true for a placeholder queued by the wake side */
};

/* Hash buckets of sleepers, indexed by hash_32(token); each has its own lock. */
static struct kvm_task_sleep_head {
	raw_spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
95 
96 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
97 						  u32 token)
98 {
99 	struct hlist_node *p;
100 
101 	hlist_for_each(p, &b->list) {
102 		struct kvm_task_sleep_node *n =
103 			hlist_entry(p, typeof(*n), link);
104 		if (n->token == token)
105 			return n;
106 	}
107 
108 	return NULL;
109 }
110 
111 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
112 {
113 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
114 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
115 	struct kvm_task_sleep_node *e;
116 
117 	raw_spin_lock(&b->lock);
118 	e = _find_apf_task(b, token);
119 	if (e) {
120 		struct kvm_task_sleep_node *dummy = NULL;
121 
122 		/*
123 		 * The entry can either be a 'dummy' entry (which is put on the
124 		 * list when wake-up happens ahead of APF handling completion)
125 		 * or a token from another task which should not be touched.
126 		 */
127 		if (e->dummy) {
128 			hlist_del(&e->link);
129 			dummy = e;
130 		}
131 
132 		raw_spin_unlock(&b->lock);
133 		kfree(dummy);
134 		return false;
135 	}
136 
137 	n->token = token;
138 	n->cpu = smp_processor_id();
139 	n->dummy = false;
140 	init_swait_queue_head(&n->wq);
141 	hlist_add_head(&n->link, &b->list);
142 	raw_spin_unlock(&b->lock);
143 	return true;
144 }
145 
/*
 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
 * @token:	Token to identify the sleep node entry
 *
 * Invoked from the async pagefault handling code or from the VM exit page
 * fault handler. In both cases RCU is watching.
 *
 * Must be called with interrupts disabled (asserted below). Interrupts are
 * enabled only around schedule() and are disabled again on return.
 */
void kvm_async_pf_task_wait_schedule(u32 token)
{
	struct kvm_task_sleep_node n;
	DECLARE_SWAITQUEUE(wait);

	lockdep_assert_irqs_disabled();

	/* Nothing to wait for when the token could not be queued. */
	if (!kvm_async_pf_queue_task(token, &n))
		return;

	for (;;) {
		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		/* The wake side unhashes the node (hlist_del_init) when done. */
		if (hlist_unhashed(&n.link))
			break;

		local_irq_enable();
		schedule();
		local_irq_disable();
	}
	finish_swait(&n.wq, &wait);
}
EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
175 
/*
 * Remove @n from its bucket and wake the task sleeping on it, if any.
 * hlist_del_init() leaves the node unhashed, which is the condition the
 * waiter in kvm_async_pf_task_wait_schedule() checks for.
 */
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
}
182 
183 static void apf_task_wake_all(void)
184 {
185 	int i;
186 
187 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
188 		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
189 		struct kvm_task_sleep_node *n;
190 		struct hlist_node *p, *next;
191 
192 		raw_spin_lock(&b->lock);
193 		hlist_for_each_safe(p, next, &b->list) {
194 			n = hlist_entry(p, typeof(*n), link);
195 			if (n->cpu == smp_processor_id())
196 				apf_task_wake_one(n);
197 		}
198 		raw_spin_unlock(&b->lock);
199 	}
200 }
201 
/*
 * Wake the task waiting on async #PF @token.
 *
 * A token of ~0 wakes every sleeper queued by the current CPU. Otherwise the
 * matching node is woken; if no node exists yet, a dummy entry is queued so
 * that kvm_async_pf_queue_task() can detect the early wake-up.
 */
static void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n, *dummy = NULL;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	raw_spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * Async #PF not yet handled, add a dummy entry for the token.
		 * Allocating the token must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		if (!dummy) {
			raw_spin_unlock(&b->lock);
			dummy = kzalloc_obj(*dummy, GFP_ATOMIC);

			/*
			 * Continue looping on allocation failure, eventually
			 * the async #PF will be handled and allocating a new
			 * node will be unnecessary.
			 */
			if (!dummy)
				cpu_relax();

			/*
			 * Recheck for async #PF completion before enqueueing
			 * the dummy token to avoid duplicate list entries.
			 */
			goto again;
		}
		dummy->token = token;
		dummy->cpu = smp_processor_id();
		dummy->dummy = true;
		init_swait_queue_head(&dummy->wq);
		hlist_add_head(&dummy->link, &b->list);
		/* Ownership moved to the list; do not free it below. */
		dummy = NULL;
	} else {
		apf_task_wake_one(n);
	}
	raw_spin_unlock(&b->lock);

	/* A dummy token might be allocated and ultimately not used.  */
	kfree(dummy);
}
254 
255 noinstr u32 kvm_read_and_reset_apf_flags(void)
256 {
257 	u32 flags = 0;
258 
259 	if (__this_cpu_read(async_pf_enabled)) {
260 		flags = __this_cpu_read(apf_reason.flags);
261 		__this_cpu_write(apf_reason.flags, 0);
262 	}
263 
264 	return flags;
265 }
266 EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
267 
/*
 * Handle a page fault that may be an async #PF notification from the host.
 * @regs:	register state at the time of the fault
 * @token:	async #PF token passed on to the wait code
 *
 * Returns false when no async #PF flags were pending, i.e. the fault was not
 * consumed here; returns true once the event has been processed.
 */
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
	u32 flags = kvm_read_and_reset_apf_flags();
	irqentry_state_t state;

	if (!flags)
		return false;

	state = irqentry_enter(regs);
	instrumentation_begin();

	/*
	 * If the host managed to inject an async #PF into an interrupt
	 * disabled region, then die hard as this is not going to end well
	 * and the host side is seriously broken.
	 */
	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
		panic("Host injected async #PF in interrupt disabled region\n");

	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
		/* Only faults taken in user mode are accepted. */
		if (unlikely(!(user_mode(regs))))
			panic("Host injected async #PF in kernel mode\n");
		/* Page is swapped out by the host. */
		kvm_async_pf_task_wait_schedule(token);
	} else {
		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
	}

	instrumentation_end();
	irqentry_exit(regs, state);
	return true;
}
300 
/*
 * Interrupt handler for the async #PF notification vector. When async #PF is
 * enabled on this CPU, it wakes the waiter for the token found in apf_reason,
 * clears the token and acknowledges it through MSR_KVM_ASYNC_PF_ACK.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
	struct pt_regs *old_regs = set_irq_regs(regs);
	u32 token;

	apic_eoi();

	inc_irq_stat(HYPERVISOR_CALLBACK);

	if (__this_cpu_read(async_pf_enabled)) {
		token = __this_cpu_read(apf_reason.token);
		kvm_async_pf_task_wake(token);
		/* Clear the token before acknowledging it to the host. */
		__this_cpu_write(apf_reason.token, 0);
		wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
	}

	set_irq_regs(old_regs);
}
319 
/*
 * Basic paravirt setup: name the platform "KVM", turn off the I/O delay when
 * the host advertises KVM_FEATURE_NOP_IO_DELAY, and set no_timer_check on
 * IO-APIC builds.
 */
static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_info.io_delay = false;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}
331 
332 static void kvm_register_steal_time(void)
333 {
334 	int cpu = smp_processor_id();
335 	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
336 
337 	if (!has_steal_clock)
338 		return;
339 
340 	wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
341 	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
342 		(unsigned long long) slow_virt_to_phys(st));
343 }
344 
345 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
346 
347 static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
348 {
349 	/**
350 	 * This relies on __test_and_clear_bit to modify the memory
351 	 * in a way that is atomic with respect to the local CPU.
352 	 * The hypervisor only accesses this memory from the local CPU so
353 	 * there's no need for lock or memory barriers.
354 	 * An optimization barrier is implied in apic write.
355 	 */
356 	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
357 		return;
358 	apic_native_eoi();
359 }
360 
/*
 * Enable the per-CPU paravirt features on the current CPU: async page
 * faults, PV EOI and steal time. Each is set up only when the corresponding
 * feature or flag is present.
 */
static void kvm_guest_cpu_init(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
		u64 pa;

		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));

		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
		pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;

		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

		/* Program the notification vector before enabling async #PF. */
		wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);

		wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(async_pf_enabled, true);
		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;

		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__this_cpu_write(kvm_apic_eoi, 0);
		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
			| KVM_MSR_ENABLED;
		wrmsrq(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}
395 
396 static void kvm_pv_disable_apf(void)
397 {
398 	if (!__this_cpu_read(async_pf_enabled))
399 		return;
400 
401 	wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
402 	__this_cpu_write(async_pf_enabled, false);
403 
404 	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
405 }
406 
407 static void kvm_disable_steal_time(void)
408 {
409 	if (!has_steal_clock)
410 		return;
411 
412 	wrmsrq(MSR_KVM_STEAL_TIME, 0);
413 }
414 
/*
 * Read the steal time value of @cpu from its steal_time area.
 *
 * The read is retried while the version field is odd or differs before and
 * after reading the value, so a consistent snapshot is returned.
 */
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		/* Order the version read before the steal read. */
		virt_rmb();
		steal = src->steal;
		/* Order the steal read before the version re-check. */
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}
431 
/* Mark @size bytes starting at @ptr as decrypted via early_set_memory_decrypted(). */
static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
{
	early_set_memory_decrypted((unsigned long) ptr, size);
}
436 
437 /*
438  * Iterate through all possible CPUs and map the memory region pointed
439  * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
440  *
441  * Note: we iterate through all possible CPUs to ensure that CPUs
442  * hotplugged will have their per-cpu variable already mapped as
443  * decrypted.
444  */
445 static void __init sev_map_percpu_data(void)
446 {
447 	int cpu;
448 
449 	if (cc_vendor != CC_VENDOR_AMD ||
450 	    !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
451 		return;
452 
453 	for_each_possible_cpu(cpu) {
454 		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
455 		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
456 		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
457 	}
458 }
459 
/*
 * Disable the paravirt features on the current CPU: steal time, PV EOI,
 * migration control, async page faults and kvmclock.
 * @shutdown:	when false, sleepers queued by this CPU are also woken.
 */
static void kvm_guest_cpu_offline(bool shutdown)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrq(MSR_KVM_PV_EOI_EN, 0);
	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
	kvm_pv_disable_apf();
	if (!shutdown)
		apf_task_wake_all();
	kvmclock_disable();
}
472 
/*
 * CPU online callback: run kvm_guest_cpu_init() with interrupts disabled.
 * @cpu is not used; the init code operates on the current CPU. Always
 * returns 0.
 */
static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_init();
	local_irq_restore(flags);
	return 0;
}
482 
483 #ifdef CONFIG_SMP
484 
485 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
486 
487 static bool pv_tlb_flush_supported(void)
488 {
489 	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
490 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
491 		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
492 		!boot_cpu_has(X86_FEATURE_MWAIT) &&
493 		(num_possible_cpus() != 1));
494 }
495 
496 static bool pv_ipi_supported(void)
497 {
498 	return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
499 	       (num_possible_cpus() != 1));
500 }
501 
502 static bool pv_sched_yield_supported(void)
503 {
504 	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
505 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
506 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
507 	    !boot_cpu_has(X86_FEATURE_MWAIT) &&
508 	    (num_possible_cpus() != 1));
509 }
510 
/* Number of APIC IDs one KVM_HC_SEND_IPI hypercall can address (two longs). */
#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)

/*
 * Send @vector to every CPU in @mask using the KVM_HC_SEND_IPI hypercall.
 *
 * APIC IDs are gathered into a bitmap relative to the lowest ID seen
 * ("min"). When the next ID does not fit into the current window of
 * KVM_IPI_CLUSTER_SIZE IDs, the bitmap is flushed with a hypercall and a new
 * window is started. Runs with interrupts disabled.
 */
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
	unsigned long flags;
	int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
	__uint128_t ipi_bitmap = 0;
#else
	u64 ipi_bitmap = 0;
#endif
	u32 apic_id, icr;
	long ret;

	if (cpumask_empty(mask))
		return;

	local_irq_save(flags);

	/* NMI_VECTOR is sent in NMI delivery mode, everything else as fixed. */
	switch (vector) {
	default:
		icr = APIC_DM_FIXED | vector;
		break;
	case NMI_VECTOR:
		icr = APIC_DM_NMI;
		break;
	}

	for_each_cpu(cpu, mask) {
		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
		if (!ipi_bitmap) {
			/* First ID of a new window. */
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			/* Extend the window downwards and rebase the bitmap. */
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			/* Fits above min; track the highest ID seen. */
			max = apic_id < max ? max : apic_id;
		} else {
			/* Does not fit: flush the current window, start anew. */
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
				  ret);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	}

	/* Flush whatever is left in the final window. */
	if (ipi_bitmap) {
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
			  ret);
	}

	local_irq_restore(flags);
}
568 
/* send_IPI_mask callback: forward @mask and @vector to __send_ipi_mask(). */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}
573 
574 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
575 {
576 	unsigned int this_cpu = smp_processor_id();
577 	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
578 	const struct cpumask *local_mask;
579 
580 	cpumask_copy(new_mask, mask);
581 	cpumask_clear_cpu(this_cpu, new_mask);
582 	local_mask = new_mask;
583 	__send_ipi_mask(local_mask, vector);
584 }
585 
586 static int __init setup_efi_kvm_sev_migration(void)
587 {
588 	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
589 	efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
590 	efi_status_t status;
591 	unsigned long size;
592 	bool enabled;
593 
594 	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
595 	    !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
596 		return 0;
597 
598 	if (!efi_enabled(EFI_BOOT))
599 		return 0;
600 
601 	if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
602 		pr_info("%s : EFI runtime services are not enabled\n", __func__);
603 		return 0;
604 	}
605 
606 	size = sizeof(enabled);
607 
608 	/* Get variable contents into buffer */
609 	status = efi.get_variable(efi_sev_live_migration_enabled,
610 				  &efi_variable_guid, NULL, &size, &enabled);
611 
612 	if (status == EFI_NOT_FOUND) {
613 		pr_info("%s : EFI live migration variable not found\n", __func__);
614 		return 0;
615 	}
616 
617 	if (status != EFI_SUCCESS) {
618 		pr_info("%s : EFI variable retrieval failed\n", __func__);
619 		return 0;
620 	}
621 
622 	if (enabled == 0) {
623 		pr_info("%s: live migration disabled in EFI\n", __func__);
624 		return 0;
625 	}
626 
627 	pr_info("%s : live migration enabled in EFI\n", __func__);
628 	wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
629 
630 	return 1;
631 }
632 
633 late_initcall(setup_efi_kvm_sev_migration);
634 
/*
 * Set the IPI entry points: route the APIC send_IPI_mask and
 * send_IPI_mask_allbutself callbacks to the KVM PV implementations.
 */
static __init void kvm_setup_pv_ipi(void)
{
	apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
	apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
	pr_info("setup PV IPIs\n");
}
644 
645 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
646 {
647 	int cpu;
648 
649 	native_send_call_func_ipi(mask);
650 
651 	/* Make sure other vCPUs get a chance to run if they need to. */
652 	for_each_cpu(cpu, mask) {
653 		if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
654 			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
655 			break;
656 		}
657 	}
658 }
659 
/*
 * PV flush_tlb_multi implementation. For each target vCPU that is marked
 * preempted, try to set KVM_VCPU_FLUSH_TLB in its steal_time area; on
 * success the vCPU is dropped from the mask. The native flush is then done
 * for the remaining CPUs.
 */
static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
			const struct flush_tlb_info *info)
{
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

	/* Work on a per-CPU scratch copy; the caller's mask is const. */
	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to call flush only on online vCPUs. And
	 * queue flush_on_enter for pre-empted vCPUs
	 */
	for_each_cpu(cpu, flushmask) {
		/*
		 * The local vCPU is never preempted, so we do not explicitly
		 * skip check for local vCPU - it will never be cleared from
		 * flushmask.
		 */
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if ((state & KVM_VCPU_PREEMPTED)) {
			/* Drop the CPU only if the flag was set atomically. */
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}

	native_flush_tlb_multi(flushmask, info);
}
690 
691 static __init int kvm_alloc_cpumask(void)
692 {
693 	int cpu;
694 
695 	if (!kvm_para_available() || nopv)
696 		return 0;
697 
698 	if (pv_tlb_flush_supported() || pv_ipi_supported())
699 		for_each_possible_cpu(cpu) {
700 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
701 				GFP_KERNEL, cpu_to_node(cpu));
702 		}
703 
704 	return 0;
705 }
706 arch_initcall(kvm_alloc_cpumask);
707 
/*
 * smp_prepare_boot_cpu hook installed by kvm_guest_init(): set up the boot
 * CPU's paravirt state, run the native preparation and initialise PV
 * spinlocks.
 */
static void __init kvm_smp_prepare_boot_cpu(void)
{
	/*
	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
	 * shares the guest physical address with the hypervisor.
	 */
	sev_map_percpu_data();

	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
	kvm_spinlock_init();
}
720 
/*
 * CPU hotplug teardown callback: run kvm_guest_cpu_offline() in non-shutdown
 * mode with interrupts disabled. @cpu is not used. Always returns 0.
 */
static int kvm_cpu_down_prepare(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_offline(false);
	local_irq_restore(flags);
	return 0;
}
730 
731 #endif
732 
/*
 * Syscore suspend hook: take the paravirt features offline on this CPU and,
 * with haltpoll support built in, record in has_guest_poll whether bit 0 of
 * MSR_KVM_POLL_CONTROL was clear. Always returns 0.
 */
static int kvm_suspend(void *data)
{
	u64 val = 0;

	kvm_guest_cpu_offline(false);

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	/* val stays 0 when the host has no poll control MSR. */
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		rdmsrq(MSR_KVM_POLL_CONTROL, val);
	has_guest_poll = !(val & 1);
#endif
	return 0;
}
746 
/*
 * Syscore resume hook: re-enable the paravirt features on this CPU and, with
 * haltpoll support built in, write 0 to MSR_KVM_POLL_CONTROL again if
 * kvm_suspend() recorded has_guest_poll.
 */
static void kvm_resume(void *data)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
		wrmsrq(MSR_KVM_POLL_CONTROL, 0);
#endif
}
756 
/* Suspend/resume hooks; registered from kvm_guest_init() via kvm_syscore. */
static const struct syscore_ops kvm_syscore_ops = {
	.suspend	= kvm_suspend,
	.resume		= kvm_resume,
};

static struct syscore kvm_syscore = {
	.ops = &kvm_syscore_ops,
};
765 
/* Per-CPU reboot callback: take the paravirt features offline in shutdown mode. */
static void kvm_pv_guest_cpu_reboot(void *unused)
{
	kvm_guest_cpu_offline(true);
}
770 
771 static int kvm_pv_reboot_notify(struct notifier_block *nb,
772 				unsigned long code, void *unused)
773 {
774 	if (code == SYS_RESTART)
775 		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
776 	return NOTIFY_DONE;
777 }
778 
779 static struct notifier_block kvm_pv_reboot_nb = {
780 	.notifier_call = kvm_pv_reboot_notify,
781 };
782 
/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shut down, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel, this
 * means the host would keep writing to a random memory location.
 */
789 #ifdef CONFIG_CRASH_DUMP
/*
 * Crash shutdown hook: take the paravirt features offline on this CPU in
 * shutdown mode, then continue with the native crash shutdown.
 */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);
	native_machine_crash_shutdown(regs);
}
795 #endif
796 
797 #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
798 bool __kvm_vcpu_is_preempted(long cpu);
799 
800 __visible bool __kvm_vcpu_is_preempted(long cpu)
801 {
802 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
803 
804 	return !!(src->preempted & KVM_VCPU_PREEMPTED);
805 }
806 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
807 
808 #else
809 
810 #include <asm/asm-offsets.h>
811 
extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 * restoring to/from the stack.
 *
 * The sequence loads the per-cpu offset for the CPU number in %rdi, compares
 * the 'preempted' byte of that CPU's steal_time with 0 and sets %al to 1 if
 * it is non-zero.
 */
#define PV_VCPU_PREEMPTED_ASM						     \
 "movq   __per_cpu_offset(,%rdi,8), %rax\n\t"				     \
 "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
 "setne  %al\n\t"

DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
		PV_VCPU_PREEMPTED_ASM, .text);
825 #endif
826 
/*
 * Guest late-init hook (see x86_hyper_kvm): install the KVM paravirt
 * callbacks and notifiers according to the features the host advertises.
 */
static void __init kvm_guest_init(void)
{
	int i;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	/* Initialise the lock of every async #PF sleeper bucket. */
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		raw_spin_lock_init(&async_pf_sleepers[i].lock);

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		static_call_update(pv_steal_clock, kvm_steal_clock);

#ifdef CONFIG_PARAVIRT_SPINLOCKS
		pv_ops_lock.vcpu_is_preempted =
			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
#endif
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_update_callback(eoi, kvm_guest_apic_eoi_write);

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
		static_branch_enable(&kvm_async_pf_enabled);
		sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
	}

#ifdef CONFIG_SMP
	if (pv_tlb_flush_supported()) {
		pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
		pr_info("KVM setup pv remote TLB flush\n");
	}

	/* Per-CPU init of the boot CPU happens later via this hook. */
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	if (pv_sched_yield_supported()) {
		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
		pr_info("setup PV sched yield\n");
	}
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("failed to install cpu hotplug callbacks\n");
#else
	/* Without SMP there is no boot-CPU hook; initialise the CPU here. */
	sev_map_percpu_data();
	kvm_guest_cpu_init();
#endif

#ifdef CONFIG_CRASH_DUMP
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif

	register_syscore(&kvm_syscore);

	/*
	 * Hard lockup detection is enabled by default. Disable it, as guests
	 * can get false positives too easily, for example if the host is
	 * overcommitted.
	 */
	hardlockup_detector_disable();
}
886 
887 static noinline uint32_t __kvm_cpuid_base(void)
888 {
889 	if (boot_cpu_data.cpuid_level < 0)
890 		return 0;	/* So we don't blow up on old processors */
891 
892 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
893 		return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
894 
895 	return 0;
896 }
897 
/*
 * Return the KVM CPUID base leaf, computing it on first use and caching the
 * result in a function-local static (-1 means "not looked up yet").
 */
static inline uint32_t kvm_cpuid_base(void)
{
	static int kvm_cpuid_base = -1;

	if (kvm_cpuid_base == -1)
		kvm_cpuid_base = __kvm_cpuid_base();

	return kvm_cpuid_base;
}
907 
/* Return true when a KVM CPUID base leaf was found (kvm_cpuid_base() != 0). */
bool kvm_para_available(void)
{
	return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);
913 
/* Return EAX of the KVM_CPUID_FEATURES leaf relative to the KVM CPUID base. */
unsigned int kvm_arch_para_features(void)
{
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
918 
/* Return EDX of the KVM_CPUID_FEATURES leaf relative to the KVM CPUID base. */
unsigned int kvm_arch_para_hints(void)
{
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
924 
/* Hypervisor detect hook: returns the KVM CPUID base, 0 if KVM was not found. */
static uint32_t __init kvm_detect(void)
{
	return kvm_cpuid_base();
}
929 
/* apic_post_init hook: switch to PV IPIs on SMP builds when supported. */
static void __init kvm_apic_init(void)
{
#ifdef CONFIG_SMP
	if (pv_ipi_supported())
		kvm_setup_pv_ipi();
#endif
}
937 
/* Report whether the host advertises KVM_FEATURE_MSI_EXT_DEST_ID. */
static bool __init kvm_msi_ext_dest_id(void)
{
	return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}
942 
/*
 * Tell the host about the encryption status of @npages 4K pages starting at
 * page frame @pfn via the KVM_HC_MAP_GPA_RANGE hypercall.
 * @enc selects the status passed through KVM_MAP_GPA_RANGE_ENC_STAT().
 */
static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
			   KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
}
948 
/*
 * Platform init hook (see x86_hyper_kvm). On memory-encrypted guests with
 * migration control it resets the page encryption status reported to the
 * host; in all cases it initialises kvmclock, installs the APIC post-init
 * hook and forces the guest MTRR state.
 */
static void __init kvm_init_platform(void)
{
	/* Physical address just past the end of low RAM. */
	u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
	/*
	 * Note, hardware requires variable MTRR ranges to be power-of-2 sized
	 * and naturally aligned.  But when forcing guest MTRR state, Linux
	 * doesn't program the forced ranges into hardware.  Don't bother doing
	 * the math to generate a technically-legal range.
	 */
	struct mtrr_var_range pci_hole = {
		.base_lo = tolud | X86_MEMTYPE_UC,
		.mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
		.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
	};

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
	    kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
		unsigned long nr_pages;
		int i;

		pv_ops.mmu.notify_page_enc_status_changed =
			kvm_sev_hc_page_enc_status;

		/*
		 * Reset the host's shared pages list related to kernel
		 * specific page encryption status settings before we load a
		 * new kernel by kexec. Reset the page encryption status
		 * during early boot instead of just before kexec to avoid SMP
		 * races during kvm_pv_guest_cpu_reboot().
		 * NOTE: We cannot reset the complete shared pages list
		 * here as we need to retain the UEFI/OVMF firmware
		 * specific settings.
		 */

		/* Report every E820 RAM range to the host as encrypted. */
		for (i = 0; i < e820_table->nr_entries; i++) {
			struct e820_entry *entry = &e820_table->entries[i];

			if (entry->type != E820_TYPE_RAM)
				continue;

			nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);

			kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
				       nr_pages,
				       KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
		}

		/*
		 * Ensure that _bss_decrypted section is marked as decrypted in the
		 * shared pages list.
		 */
		early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
						__end_bss_decrypted - __start_bss_decrypted, 0);

		/*
		 * If not booted using EFI, enable Live migration support.
		 */
		if (!efi_enabled(EFI_BOOT))
			wrmsrq(MSR_KVM_MIGRATION_CONTROL,
			       KVM_MIGRATION_READY);
	}
	kvmclock_init();
	x86_platform.apic_post_init = kvm_apic_init;

	/*
	 * Set WB as the default cache mode for SEV-SNP and TDX, with a single
	 * UC range for the legacy PCI hole, e.g. so that devices that expect
	 * to get UC/WC mappings don't get surprised with WB.
	 */
	guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
}
1020 
1021 #if defined(CONFIG_AMD_MEM_ENCRYPT)
/*
 * SEV-ES hypercall preparation hook: copy RBX, RCX, RDX and RSI from @regs
 * into @ghcb.
 */
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* RAX and CPL are already in the GHCB */
	ghcb_set_rbx(ghcb, regs->bx);
	ghcb_set_rcx(ghcb, regs->cx);
	ghcb_set_rdx(ghcb, regs->dx);
	ghcb_set_rsi(ghcb, regs->si);
}
1030 
/* SEV-ES hypercall completion hook: nothing to validate, always returns true. */
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* No checking of the return state needed */
	return true;
}
1036 #endif
1037 
/*
 * Hypervisor descriptor for KVM: ties the detection routine and the init and
 * runtime hooks defined in this file to the x86 hypervisor framework.
 */
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name				= "KVM",
	.detect				= kvm_detect,
	.type				= X86_HYPER_KVM,
	.init.guest_late_init		= kvm_guest_init,
	.init.x2apic_available		= kvm_para_available,
	.init.msi_ext_dest_id		= kvm_msi_ext_dest_id,
	.init.init_platform		= kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
	.runtime.sev_es_hcall_prepare	= kvm_sev_es_hcall_prepare,
	.runtime.sev_es_hcall_finish	= kvm_sev_es_hcall_finish,
#endif
};
1051 
1052 static __init int activate_jump_labels(void)
1053 {
1054 	if (has_steal_clock) {
1055 		static_key_slow_inc(&paravirt_steal_enabled);
1056 		if (steal_acc)
1057 			static_key_slow_inc(&paravirt_steal_rq_enabled);
1058 	}
1059 
1060 	return 0;
1061 }
1062 arch_initcall(activate_jump_labels);
1063 
1064 #ifdef CONFIG_PARAVIRT_SPINLOCKS
1065 
1066 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
1067 static void kvm_kick_cpu(int cpu)
1068 {
1069 	unsigned long flags = 0;
1070 	u32 apicid;
1071 
1072 	apicid = per_cpu(x86_cpu_to_apicid, cpu);
1073 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1074 }
1075 
1076 #include <asm/qspinlock.h>
1077 
/*
 * PV spinlock wait callback: halt the vCPU while *@ptr still equals @val.
 * Returns immediately when called from NMI context.
 */
static void kvm_wait(u8 *ptr, u8 val)
{
	if (in_nmi())
		return;

	/*
	 * halt until it's our turn and kicked. Note that we do safe halt
	 * for irq enabled case to avoid hang when lock info is overwritten
	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
	 */
	if (irqs_disabled()) {
		if (READ_ONCE(*ptr) == val)
			halt();
	} else {
		/* Check the value with interrupts off. */
		local_irq_disable();

		/* safe_halt() will enable IRQ */
		if (READ_ONCE(*ptr) == val)
			safe_halt();
		else
			local_irq_enable();
	}
}
1101 
/*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 *
 * PV spinlocks are left disabled when the REALTIME hint is set, only one CPU
 * is possible, "nopvspin" was given, or the host lacks PV_UNHALT. In all but
 * the last case virt_spin_lock_key is disabled as well.
 */
void __init kvm_spinlock_init(void)
{
	/*
	 * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
	 * are available.
	 */
	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
		goto out;
	}

	if (num_possible_cpus() == 1) {
		pr_info("PV spinlocks disabled, single CPU\n");
		goto out;
	}

	if (nopvspin) {
		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
		goto out;
	}

	/*
	 * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
	 * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
	 * preferred over native qspinlock when vCPU is preempted.
	 */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
		pr_info("PV spinlocks disabled, no host support\n");
		/* Plain return: virt_spin_lock_key is deliberately left alone. */
		return;
	}

	pr_info("PV spinlocks enabled\n");

	__pv_init_lock_hash();
	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	pv_ops_lock.queued_spin_unlock =
		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	pv_ops_lock.wait = kvm_wait;
	pv_ops_lock.kick = kvm_kick_cpu;

	/*
	 * When PV spinlock is enabled which is preferred over
	 * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
	 * Just disable it anyway.
	 */
out:
	static_branch_disable(&virt_spin_lock_key);
}
1153 
1154 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
1155 
1156 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1157 
/* Cross-CPU callback: write 0 to MSR_KVM_POLL_CONTROL (host halt poll off). */
static void kvm_disable_host_haltpoll(void *i)
{
	wrmsrq(MSR_KVM_POLL_CONTROL, 0);
}
1162 
/* Cross-CPU callback: write 1 to MSR_KVM_POLL_CONTROL (host halt poll on). */
static void kvm_enable_host_haltpoll(void *i)
{
	wrmsrq(MSR_KVM_POLL_CONTROL, 1);
}
1167 
/*
 * Enable guest halt polling for @cpu by disabling host halt polling on it.
 * Logs an error once and does nothing if the host lacks poll control.
 */
void arch_haltpoll_enable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
		pr_err_once("host does not support poll control\n");
		pr_err_once("host upgrade recommended\n");
		return;
	}

	/* Enable guest halt poll disables host halt poll */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1180 
1181 void arch_haltpoll_disable(unsigned int cpu)
1182 {
1183 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1184 		return;
1185 
1186 	/* Disable guest halt poll enables host halt poll */
1187 	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1188 }
1189 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1190 #endif
1191