xref: /linux/arch/x86/kernel/cpu/vmware.c (revision 4b99990cdf9560e8a071640baf19f312e6ae02f4)
1 /*
2  * VMware Detection code.
3  *
4  * Copyright (C) 2008, VMware, Inc.
5  * Author : Alok N Kataria <akataria@vmware.com>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15  * NON INFRINGEMENT.  See the GNU General Public License for more
16  * details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  */
23 
24 #include <linux/dmi.h>
25 #include <linux/init.h>
26 #include <linux/export.h>
27 #include <linux/clocksource.h>
28 #include <linux/cpu.h>
29 #include <linux/efi.h>
30 #include <linux/reboot.h>
31 #include <linux/static_call.h>
32 #include <linux/sched/cputime.h>
33 #include <asm/div64.h>
34 #include <asm/x86_init.h>
35 #include <asm/hypervisor.h>
36 #include <asm/cpuid/api.h>
37 #include <asm/timer.h>
38 #include <asm/apic.h>
39 #include <asm/vmware.h>
40 #include <asm/svm.h>
41 
/* Prefix every pr_*() message emitted from this file with "vmware: ". */
#undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt

/* CPUID leaves queried while detecting the VMware hypervisor. */
#define CPUID_VMWARE_INFO_LEAF		0x40000000
#define CPUID_VMWARE_FEATURES_LEAF	0x40000010

/* Bits tested in the value returned by VMWARE_CMD_GETVCPU_INFO. */
#define GETVCPU_INFO_LEGACY_X2APIC	BIT(3)
#define GETVCPU_INFO_VCPU_RESERVED	BIT(31)

/* Values the VMWARE_CMD_STEALCLOCK hypercall result is compared against. */
#define STEALCLOCK_NOT_AVAILABLE	(-1)
#define STEALCLOCK_DISABLED		0
#define STEALCLOCK_ENABLED		1
54 
/*
 * Per-CPU steal time record. Its physical address is handed to the
 * hypervisor via VMWARE_CMD_STEALCLOCK and the guest then reads the counter
 * from it. One u64 counter plus seven reserved u64 words make the record
 * 64 bytes long.
 */
struct vmware_steal_time {
	union {
		u64 clock;	/* stolen time counter in units of vtsc */
		struct {
			/* only for little-endian */
			/* halves of @clock, read separately on 32-bit builds */
			u32 clock_low;
			u32 clock_high;
		};
	};
	/* not accessed by the guest code in this file */
	u64 reserved[7];
};
66 
67 static unsigned long vmware_tsc_khz __ro_after_init;
68 static u8 vmware_hypercall_mode     __ro_after_init;
69 
70 unsigned long vmware_hypercall_slow(unsigned long cmd,
71 				    unsigned long in1, unsigned long in3,
72 				    unsigned long in4, unsigned long in5,
73 				    u32 *out1, u32 *out2, u32 *out3,
74 				    u32 *out4, u32 *out5)
75 {
76 	unsigned long out0, rbx, rcx, rdx, rsi, rdi;
77 
78 	switch (vmware_hypercall_mode) {
79 	case CPUID_VMWARE_FEATURES_ECX_VMCALL:
80 		asm_inline volatile ("vmcall"
81 				: "=a" (out0), "=b" (rbx), "=c" (rcx),
82 				"=d" (rdx), "=S" (rsi), "=D" (rdi)
83 				: "a" (VMWARE_HYPERVISOR_MAGIC),
84 				"b" (in1),
85 				"c" (cmd),
86 				"d" (in3),
87 				"S" (in4),
88 				"D" (in5)
89 				: "cc", "memory");
90 		break;
91 	case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
92 		asm_inline volatile ("vmmcall"
93 				: "=a" (out0), "=b" (rbx), "=c" (rcx),
94 				"=d" (rdx), "=S" (rsi), "=D" (rdi)
95 				: "a" (VMWARE_HYPERVISOR_MAGIC),
96 				"b" (in1),
97 				"c" (cmd),
98 				"d" (in3),
99 				"S" (in4),
100 				"D" (in5)
101 				: "cc", "memory");
102 		break;
103 	default:
104 		asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
105 				: "=a" (out0), "=b" (rbx), "=c" (rcx),
106 				"=d" (rdx), "=S" (rsi), "=D" (rdi)
107 				: [port] "i" (VMWARE_HYPERVISOR_PORT),
108 				"a" (VMWARE_HYPERVISOR_MAGIC),
109 				"b" (in1),
110 				"c" (cmd),
111 				"d" (in3),
112 				"S" (in4),
113 				"D" (in5)
114 				: "cc", "memory");
115 		break;
116 	}
117 
118 	if (out1)
119 		*out1 = rbx;
120 	if (out2)
121 		*out2 = rcx;
122 	if (out3)
123 		*out3 = rdx;
124 	if (out4)
125 		*out4 = rsi;
126 	if (out5)
127 		*out5 = rdi;
128 
129 	return out0;
130 }
131 
132 static inline int __vmware_platform(void)
133 {
134 	u32 eax, ebx, ecx;
135 
136 	eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
137 	return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
138 }
139 
140 static unsigned long vmware_get_tsc_khz(void)
141 {
142 	return vmware_tsc_khz;
143 }
144 
145 #ifdef CONFIG_PARAVIRT
146 static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
147 static bool vmw_sched_clock __initdata = true;
148 static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
149 static bool has_steal_clock;
150 static bool steal_acc __initdata = true; /* steal time accounting */
151 
152 static __init int setup_vmw_sched_clock(char *s)
153 {
154 	vmw_sched_clock = false;
155 	return 0;
156 }
157 early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
158 
159 static __init int parse_no_stealacc(char *arg)
160 {
161 	steal_acc = false;
162 	return 0;
163 }
164 early_param("no-steal-acc", parse_no_stealacc);
165 
166 static noinstr u64 vmware_sched_clock(void)
167 {
168 	unsigned long long ns;
169 
170 	ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
171 			     vmware_cyc2ns.cyc2ns_shift);
172 	ns -= vmware_cyc2ns.cyc2ns_offset;
173 	return ns;
174 }
175 
176 static void __init vmware_cyc2ns_setup(void)
177 {
178 	struct cyc2ns_data *d = &vmware_cyc2ns;
179 	unsigned long long tsc_now = rdtsc();
180 
181 	clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
182 			       vmware_tsc_khz, NSEC_PER_MSEC, 0);
183 	d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
184 					   d->cyc2ns_shift);
185 
186 	pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
187 }
188 
189 static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
190 {
191 	u32 info;
192 
193 	return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
194 				 &info);
195 }
196 
197 static bool stealclock_enable(phys_addr_t pa)
198 {
199 	return vmware_cmd_stealclock(upper_32_bits(pa),
200 				     lower_32_bits(pa)) == STEALCLOCK_ENABLED;
201 }
202 
203 static int __stealclock_disable(void)
204 {
205 	return vmware_cmd_stealclock(0, 1);
206 }
207 
/* Disable the steal clock; the hypercall status is deliberately ignored. */
static void stealclock_disable(void)
{
	(void)__stealclock_disable();
}
212 
213 static bool vmware_is_stealclock_available(void)
214 {
215 	return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE;
216 }
217 
218 /**
219  * vmware_steal_clock() - read the per-cpu steal clock
220  * @cpu:            the cpu number whose steal clock we want to read
221  *
222  * The function reads the steal clock if we are on a 64-bit system, otherwise
223  * reads it in parts, checking that the high part didn't change in the
224  * meantime.
225  *
226  * Return:
227  *      The steal clock reading in ns.
228  */
229 static u64 vmware_steal_clock(int cpu)
230 {
231 	struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
232 	u64 clock;
233 
234 	if (IS_ENABLED(CONFIG_64BIT))
235 		clock = READ_ONCE(steal->clock);
236 	else {
237 		u32 initial_high, low, high;
238 
239 		do {
240 			initial_high = READ_ONCE(steal->clock_high);
241 			/* Do not reorder initial_high and high readings */
242 			virt_rmb();
243 			low = READ_ONCE(steal->clock_low);
244 			/* Keep low reading in between */
245 			virt_rmb();
246 			high = READ_ONCE(steal->clock_high);
247 		} while (initial_high != high);
248 
249 		clock = ((u64)high << 32) | low;
250 	}
251 
252 	return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
253 			     vmware_cyc2ns.cyc2ns_shift);
254 }
255 
256 static void vmware_register_steal_time(void)
257 {
258 	int cpu = smp_processor_id();
259 	struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu);
260 
261 	if (!has_steal_clock)
262 		return;
263 
264 	if (!stealclock_enable(slow_virt_to_phys(st))) {
265 		has_steal_clock = false;
266 		return;
267 	}
268 
269 	pr_info("vmware-stealtime: cpu %d, pa %llx\n",
270 		cpu, (unsigned long long) slow_virt_to_phys(st));
271 }
272 
273 static void vmware_disable_steal_time(void)
274 {
275 	if (!has_steal_clock)
276 		return;
277 
278 	stealclock_disable();
279 }
280 
281 static void vmware_guest_cpu_init(void)
282 {
283 	if (has_steal_clock)
284 		vmware_register_steal_time();
285 }
286 
/*
 * Callback run on every CPU by the reboot notifier: disable the steal
 * clock. @unused is ignored.
 */
static void vmware_pv_guest_cpu_reboot(void *unused)
{
	vmware_disable_steal_time();
}
291 
292 static int vmware_pv_reboot_notify(struct notifier_block *nb,
293 				unsigned long code, void *unused)
294 {
295 	if (code == SYS_RESTART)
296 		on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
297 	return NOTIFY_DONE;
298 }
299 
300 static struct notifier_block vmware_pv_reboot_nb = {
301 	.notifier_call = vmware_pv_reboot_notify,
302 };
303 
304 #ifdef CONFIG_SMP
305 static void __init vmware_smp_prepare_boot_cpu(void)
306 {
307 	vmware_guest_cpu_init();
308 	native_smp_prepare_boot_cpu();
309 }
310 
/*
 * CPU hotplug "online" callback: run the VMware per-CPU init with local
 * interrupts disabled. @cpu is not used directly.
 *
 * Return: always 0.
 */
static int vmware_cpu_online(unsigned int cpu)
{
	local_irq_disable();

	vmware_guest_cpu_init();

	local_irq_enable();

	return 0;
}
318 
/*
 * CPU hotplug teardown callback: disable the steal clock with local
 * interrupts disabled. @cpu is not used directly.
 *
 * Return: always 0.
 */
static int vmware_cpu_down_prepare(unsigned int cpu)
{
	local_irq_disable();

	vmware_disable_steal_time();

	local_irq_enable();

	return 0;
}
326 #endif
327 
328 static __init int activate_jump_labels(void)
329 {
330 	if (has_steal_clock) {
331 		static_key_slow_inc(&paravirt_steal_enabled);
332 		if (steal_acc)
333 			static_key_slow_inc(&paravirt_steal_rq_enabled);
334 	}
335 
336 	return 0;
337 }
338 arch_initcall(activate_jump_labels);
339 
340 static void __init vmware_paravirt_ops_setup(void)
341 {
342 	pv_info.name = "VMware hypervisor";
343 	pv_info.io_delay = false;
344 
345 	if (vmware_tsc_khz == 0)
346 		return;
347 
348 	vmware_cyc2ns_setup();
349 
350 	if (vmw_sched_clock)
351 		paravirt_set_sched_clock(vmware_sched_clock);
352 
353 	if (vmware_is_stealclock_available()) {
354 		has_steal_clock = true;
355 		static_call_update(pv_steal_clock, vmware_steal_clock);
356 
357 		/* We use reboot notifier only to disable steal clock */
358 		register_reboot_notifier(&vmware_pv_reboot_nb);
359 
360 #ifdef CONFIG_SMP
361 		smp_ops.smp_prepare_boot_cpu =
362 			vmware_smp_prepare_boot_cpu;
363 		if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
364 					      "x86/vmware:online",
365 					      vmware_cpu_online,
366 					      vmware_cpu_down_prepare) < 0)
367 			pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
368 #else
369 		vmware_guest_cpu_init();
370 #endif
371 	}
372 }
373 #else
/* Built without CONFIG_PARAVIRT: paravirt setup is a no-op. */
#define vmware_paravirt_ops_setup() do { } while (0)
375 #endif
376 
377 /*
378  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
379  * Still, due to timing difference when running on virtual cpus, the TSC can
380  * be marked as unstable in some cases. For example, the TSC sync check at
381  * bootup can fail due to a marginal offset between vcpus' TSCs (though the
382  * TSCs do not drift from each other).  Also, the ACPI PM timer clocksource
383  * is not suitable as a watchdog when running on a hypervisor because the
384  * kernel may miss a wrap of the counter if the vcpu is descheduled for a
385  * long time. To skip these checks at runtime we set these capability bits,
386  * so that the kernel could just trust the hypervisor with providing a
387  * reliable virtual TSC that is suitable for timekeeping.
388  */
389 static void __init vmware_set_capabilities(void)
390 {
391 	setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
392 	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
393 	if (vmware_tsc_khz)
394 		setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
395 	if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
396 		setup_force_cpu_cap(X86_FEATURE_VMCALL);
397 	else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
398 		setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
399 }
400 
401 static void __init vmware_platform_setup(void)
402 {
403 	u32 eax, ebx, ecx;
404 	u64 lpj, tsc_khz;
405 
406 	eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
407 
408 	if (ebx != UINT_MAX) {
409 		lpj = tsc_khz = eax | (((u64)ebx) << 32);
410 		do_div(tsc_khz, 1000);
411 		WARN_ON(tsc_khz >> 32);
412 		pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
413 			(unsigned long) tsc_khz / 1000,
414 			(unsigned long) tsc_khz % 1000);
415 
416 		if (!preset_lpj) {
417 			do_div(lpj, HZ);
418 			preset_lpj = lpj;
419 		}
420 
421 		vmware_tsc_khz = tsc_khz;
422 		x86_platform.calibrate_tsc = vmware_get_tsc_khz;
423 		x86_platform.calibrate_cpu = vmware_get_tsc_khz;
424 
425 #ifdef CONFIG_X86_LOCAL_APIC
426 		/* Skip lapic calibration since we know the bus frequency. */
427 		lapic_timer_period = ecx / HZ;
428 		pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
429 			ecx);
430 #endif
431 	} else {
432 		pr_warn("Failed to get TSC freq from the hypervisor\n");
433 	}
434 
435 	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
436 		x86_init.mpparse.find_mptable = mpparse_find_mptable;
437 
438 	vmware_paravirt_ops_setup();
439 
440 #ifdef CONFIG_X86_IO_APIC
441 	no_timer_check = 1;
442 #endif
443 
444 	vmware_set_capabilities();
445 }
446 
447 static u8 __init vmware_select_hypercall(void)
448 {
449 	int eax, ebx, ecx, edx;
450 
451 	cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
452 	return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
453 		       CPUID_VMWARE_FEATURES_ECX_VMCALL));
454 }
455 
456 /*
457  * While checking the dmi string information, just checking the product
458  * serial key should be enough, as this will always have a VMware
459  * specific string when running under VMware hypervisor.
460  * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
461  * intentionally defaults to 0.
462  */
463 static u32 __init vmware_platform(void)
464 {
465 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
466 		unsigned int eax;
467 		unsigned int hyper_vendor_id[3];
468 
469 		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
470 		      &hyper_vendor_id[1], &hyper_vendor_id[2]);
471 		if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
472 			if (eax >= CPUID_VMWARE_FEATURES_LEAF)
473 				vmware_hypercall_mode =
474 					vmware_select_hypercall();
475 
476 			pr_info("hypercall mode: 0x%02x\n",
477 				(unsigned int) vmware_hypercall_mode);
478 
479 			return CPUID_VMWARE_INFO_LEAF;
480 		}
481 	} else if (dmi_available && dmi_name_in_serial("VMware") &&
482 		   __vmware_platform())
483 		return 1;
484 
485 	return 0;
486 }
487 
488 /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
489 static bool __init vmware_legacy_x2apic_available(void)
490 {
491 	u32 eax;
492 
493 	eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
494 	return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
495 		(eax & GETVCPU_INFO_LEGACY_X2APIC);
496 }
497 
498 #ifdef CONFIG_INTEL_TDX_GUEST
499 /*
500  * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
501  * we remap those registers to %r12 and %r13, respectively.
502  */
503 unsigned long vmware_tdx_hypercall(unsigned long cmd,
504 				   unsigned long in1, unsigned long in3,
505 				   unsigned long in4, unsigned long in5,
506 				   u32 *out1, u32 *out2, u32 *out3,
507 				   u32 *out4, u32 *out5)
508 {
509 	struct tdx_module_args args = {};
510 
511 	if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
512 		pr_warn_once("Incorrect usage\n");
513 		return ULONG_MAX;
514 	}
515 
516 	if (cmd & ~VMWARE_CMD_MASK) {
517 		pr_warn_once("Out of range command %lx\n", cmd);
518 		return ULONG_MAX;
519 	}
520 
521 	args.rbx = in1;
522 	args.rdx = in3;
523 	args.rsi = in4;
524 	args.rdi = in5;
525 	args.r10 = VMWARE_TDX_VENDOR_LEAF;
526 	args.r11 = VMWARE_TDX_HCALL_FUNC;
527 	args.r12 = VMWARE_HYPERVISOR_MAGIC;
528 	args.r13 = cmd;
529 	/* CPL */
530 	args.r15 = 0;
531 
532 	__tdx_hypercall(&args);
533 
534 	if (out1)
535 		*out1 = args.rbx;
536 	if (out2)
537 		*out2 = args.r13;
538 	if (out3)
539 		*out3 = args.rdx;
540 	if (out4)
541 		*out4 = args.rsi;
542 	if (out5)
543 		*out5 = args.rdi;
544 
545 	return args.r12;
546 }
547 EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
548 #endif
549 
550 #ifdef CONFIG_AMD_MEM_ENCRYPT
551 static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
552 					struct pt_regs *regs)
553 {
554 	/* Copy VMWARE specific Hypercall parameters to the GHCB */
555 	ghcb_set_rip(ghcb, regs->ip);
556 	ghcb_set_rbx(ghcb, regs->bx);
557 	ghcb_set_rcx(ghcb, regs->cx);
558 	ghcb_set_rdx(ghcb, regs->dx);
559 	ghcb_set_rsi(ghcb, regs->si);
560 	ghcb_set_rdi(ghcb, regs->di);
561 	ghcb_set_rbp(ghcb, regs->bp);
562 }
563 
564 static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
565 {
566 	if (!(ghcb_rbx_is_valid(ghcb) &&
567 	      ghcb_rcx_is_valid(ghcb) &&
568 	      ghcb_rdx_is_valid(ghcb) &&
569 	      ghcb_rsi_is_valid(ghcb) &&
570 	      ghcb_rdi_is_valid(ghcb) &&
571 	      ghcb_rbp_is_valid(ghcb)))
572 		return false;
573 
574 	regs->bx = ghcb_get_rbx(ghcb);
575 	regs->cx = ghcb_get_rcx(ghcb);
576 	regs->dx = ghcb_get_rdx(ghcb);
577 	regs->si = ghcb_get_rsi(ghcb);
578 	regs->di = ghcb_get_rdi(ghcb);
579 	regs->bp = ghcb_get_rbp(ghcb);
580 
581 	return true;
582 }
583 #endif
584 
585 const __initconst struct hypervisor_x86 x86_hyper_vmware = {
586 	.name				= "VMware",
587 	.detect				= vmware_platform,
588 	.type				= X86_HYPER_VMWARE,
589 	.init.init_platform		= vmware_platform_setup,
590 	.init.x2apic_available		= vmware_legacy_x2apic_available,
591 #ifdef CONFIG_AMD_MEM_ENCRYPT
592 	.runtime.sev_es_hcall_prepare	= vmware_sev_es_hcall_prepare,
593 	.runtime.sev_es_hcall_finish	= vmware_sev_es_hcall_finish,
594 #endif
595 };
596