1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
15  * Copyright 2024 Oxide Computer Company
16  */
17 
18 #include <sys/cpuvar.h>
19 #include <sys/types.h>
20 #include <sys/errno.h>
21 #include <sys/machsystm.h>
22 #include <sys/archsystm.h>
23 #include <sys/controlregs.h>
24 #include <sys/x86_archext.h>
25 #include <sys/id_space.h>
26 #include <sys/hma.h>
27 #include <sys/cmn_err.h>
28 #include <vm/hat.h>
29 #include <vm/as.h>
30 
31 struct hma_reg {
32 	const char	*hr_name;
33 	list_node_t	hr_node;
34 };
35 
36 static kmutex_t hma_lock;
37 static list_t hma_registrations;
38 static boolean_t hma_exclusive = B_FALSE;
39 int hma_disable = 0;
40 
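/*
 * Setting hma_disable to a non-zero value prevents HMA from initializing any
 * hardware virtualization support.  A hypothetical /etc/system entry (sketch
 * only; the usual caveats for kernel tunables apply):
 *
 *	set hma_disable = 1
 */
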
41 typedef enum hma_cpu_status {
42 	HCS_UNINITIALIZED = 0,
43 	HCS_READY,
44 	HCS_ERROR
45 } hma_cpu_status_t;
46 
47 /*
48  * When both host and guest want simultaneous use of the CPU performance
49  * counters, which should take priority?
50  *
51  * Defer to the guest by default, making its activity invisible to
52  * host-configured CPC measurements.  This is necessary since the Capacity &
53  * Utilization system keeps the CPCs active at all times when not in use by
54  * libcpc or dtrace users.
55  */
56 typedef enum hma_cpc_priority {
57 	HCP_HOST_WINS = 0,
58 	HCP_GUEST_WINS = 1,
59 } hma_cpc_priority_t;
60 static hma_cpc_priority_t hma_cpc_priority = HCP_GUEST_WINS;
61 
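/*
 * A sketch of how an operator might flip this policy on a running system so
 * that host-configured counters win instead (illustrative only; assumes the
 * symbol can be resolved by mdb on that system):
 *
 *	echo 'hma_cpc_priority/W 0' | mdb -kw
 */
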
62 /*
63  * VMX-specific per-CPU data
64  */
65 typedef struct hma_vmx_cpu {
66 	void		*hvc_vmxon_page;
67 	uintptr_t	hvc_vmxon_pa;
68 
69 } hma_vmx_cpu_t;
70 
71 /*
72  * SVM-specific per-CPU data
73  */
74 typedef struct hma_svm_cpu {
75 	void		*hsc_hsave_page;
76 	uintptr_t	hsc_hsave_pa;
77 	hma_svm_asid_t	hsc_asid;
78 	uint_t		hsc_gif_disabled;
79 	/*
80 	 * hsc_cpc_saved_flags stores the state of guest performance counters
81 	 * while inside the hma_svm_cpc_enter/hma_svm_cpc_exit critical section.
82 	 *
83 	 * If, due to the state of host counters, requested guest counters, and
84 	 * hma_cpc_priority, the guest counters are _not_ loaded during
85 	 * hma_svm_cpc_enter(), then this field will hold HCF_DISABLED,
86 	 * indicating that no state restoration is required during
87 	 * hma_svm_cpc_exit().
88 	 *
89 	 * When hsc_cpc_saved_flags is not HCF_DISABLED, then hsc_cpc_host_regs
90 	 * will hold the saved host CPC state while the guest state occupies
91 	 * those registers in the CPU.
92 	 */
93 	hma_cpc_flags_t	hsc_cpc_saved_flags;
94 	hma_cpc_t	hsc_cpc_host_regs[6];
95 } hma_svm_cpu_t;
96 
97 /*
98  * Combined per-CPU state data
99  *
100  * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a
101  * mutex specific to the module.  Since cpu_lock is already required when
102  * performing setup across all CPUs, it was a natural fit for protecting this
103  * data too.
104  */
105 struct hma_cpu {
106 	union {
107 		struct hma_vmx_cpu vmx;
108 		struct hma_svm_cpu svm;
109 	} hc_u;
110 	hma_cpu_status_t	hc_status;
111 	uintptr_t		_hc_padding[6];
112 } hma_cpu[NCPU];
113 
114 /* Keep per-CPU state aligned to cache line size to avoid false sharing */
115 CTASSERT(sizeof (struct hma_cpu) % _CACHE_LINE_SIZE == 0);
116 
117 
118 static boolean_t hma_vmx_ready = B_FALSE;
119 static const char *hma_vmx_error = NULL;
120 static id_space_t *hma_vmx_vpid;
121 
122 /* HMA-internal tracking of optional VMX capabilities */
123 typedef enum {
124 	HVC_EPT		= (1 << 0),
125 	HVC_VPID	= (1 << 1),
126 	HVC_INVEPT_ONE	= (1 << 2),
127 	HVC_INVEPT_ALL	= (1 << 3),
128 } hma_vmx_capab_t;
129 
130 static uint32_t hma_vmx_revision;
131 static hma_vmx_capab_t hma_vmx_capabs = 0;
132 
133 static boolean_t hma_svm_ready = B_FALSE;
134 static const char *hma_svm_error = NULL;
135 static uint32_t hma_svm_features;
136 static uint32_t hma_svm_max_asid;
137 static hma_cpc_flags_t hma_svm_cpc_allowed = HCF_DISABLED;
138 
139 static int hma_vmx_init(void);
140 static int hma_svm_init(void);
141 
142 /* Helpers from ml/hma_asm.s */
143 int hma_vmx_do_invept(int, uintptr_t);
144 int hma_vmx_vmxon(uintptr_t);
145 
146 void
147 hma_init(void)
148 {
149 	mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL);
150 	list_create(&hma_registrations, sizeof (struct hma_reg),
151 	    offsetof(struct hma_reg, hr_node));
152 
153 	if (hma_disable != 0) {
154 		cmn_err(CE_CONT, "?hma_init: disabled");
155 		return;
156 	}
157 
158 	switch (cpuid_getvendor(CPU)) {
159 	case X86_VENDOR_Intel:
160 		(void) hma_vmx_init();
161 		break;
162 	case X86_VENDOR_AMD:
163 	case X86_VENDOR_HYGON:
164 		(void) hma_svm_init();
165 		break;
166 	default:
167 		break;
168 	}
169 }
170 
171 static hma_reg_t *
172 hma_register_backend(const char *name)
173 {
174 	struct hma_reg *reg;
175 	boolean_t is_ready;
176 
177 	ASSERT(MUTEX_HELD(&hma_lock));
178 
179 	switch (cpuid_getvendor(CPU)) {
180 	case X86_VENDOR_Intel:
181 		is_ready = hma_vmx_ready;
182 		break;
183 	case X86_VENDOR_AMD:
184 	case X86_VENDOR_HYGON:
185 		is_ready = hma_svm_ready;
186 		break;
187 	default:
188 		is_ready = B_FALSE;
189 		break;
190 	}
191 
192 	if (!is_ready)
193 		return (NULL);
194 
195 	reg = kmem_zalloc(sizeof (*reg), KM_SLEEP);
196 	reg->hr_name = name;
197 	list_insert_tail(&hma_registrations, reg);
198 
199 	return (reg);
200 }
201 
202 hma_reg_t *
203 hma_register(const char *name)
204 {
205 	struct hma_reg *reg = NULL;
206 
207 	VERIFY(name != NULL);
208 
209 	mutex_enter(&hma_lock);
210 
211 	if (!hma_exclusive)
212 		reg = hma_register_backend(name);
213 
214 	mutex_exit(&hma_lock);
215 
216 	return (reg);
217 }
218 
219 hma_reg_t *
220 hma_register_exclusive(const char *name)
221 {
222 	struct hma_reg *reg = NULL;
223 
224 	VERIFY(name != NULL);
225 
226 	mutex_enter(&hma_lock);
227 
228 	if (list_is_empty(&hma_registrations)) {
229 		reg = hma_register_backend(name);
230 		if (reg != NULL)
231 			hma_exclusive = B_TRUE;
232 	}
233 
234 	mutex_exit(&hma_lock);
235 
236 	return (reg);
237 }
238 
239 void
240 hma_unregister(hma_reg_t *reg)
241 {
242 	VERIFY(reg != NULL);
243 	VERIFY(!list_is_empty(&hma_registrations));
244 
245 	mutex_enter(&hma_lock);
246 	list_remove(&hma_registrations, reg);
247 	if (hma_exclusive && list_is_empty(&hma_registrations))
248 		hma_exclusive = B_FALSE;
249 	mutex_exit(&hma_lock);
250 	kmem_free(reg, sizeof (*reg));
251 }
252 
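/*
 * Example (illustrative sketch): a hypervisor driver holds an HMA
 * registration for as long as it may have guest state on the CPUs.  The
 * consumer name and error handling below are hypothetical.
 *
 *	hma_reg_t *reg;
 *
 *	if ((reg = hma_register("my_hvm")) == NULL)
 *		return (ENXIO);
 *	... create and run guests ...
 *	hma_unregister(reg);
 *
 * hma_register_exclusive() follows the same pattern, but only succeeds when
 * no other registrations exist and blocks new ones until it is released.
 */
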
253 static __inline hma_vmx_cpu_t *
254 hma_vmx_cpu(processorid_t id)
255 {
256 	return (&hma_cpu[id].hc_u.vmx);
257 }
258 
259 static __inline hma_svm_cpu_t *
260 hma_svm_cpu(processorid_t id)
261 {
262 	return (&hma_cpu[id].hc_u.svm);
263 }
264 
265 /*
266  * VPID 0 is reserved for instances where VPID is disabled.  Some hypervisors
267  * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if
268  * unique VPIDs could not be allocated for all the vCPUs belonging to a VM.
269  */
270 #define	HMA_VPID_RESERVED	NCPU
271 
272 uint16_t
273 hma_vmx_vpid_alloc(void)
274 {
275 	id_t res;
276 
277 	/* Do not bother if the CPU lacks support */
278 	if ((hma_vmx_capabs & HVC_VPID) == 0) {
279 		return (0);
280 	}
281 
282 	res = id_alloc_nosleep(hma_vmx_vpid);
283 	if (res == -1) {
284 		return (0);
285 	} else {
286 		ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX);
287 		return (res);
288 	}
289 }
290 
291 void
292 hma_vmx_vpid_free(uint16_t vpid)
293 {
294 	VERIFY(vpid > HMA_VPID_RESERVED);
295 	id_free(hma_vmx_vpid, (id_t)vpid);
296 }
297 
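/*
 * Example (illustrative sketch, hypothetical consumer logic): allocate one
 * VPID per vCPU, falling back to shared/disabled VPID handling when the
 * allocator is exhausted or VPID is unsupported (both reported as 0):
 *
 *	uint16_t vpid = hma_vmx_vpid_alloc();
 *
 *	if (vpid != 0) {
 *		... program the VMCS with 'vpid' ...
 *		... and when the vCPU is torn down: ...
 *		hma_vmx_vpid_free(vpid);
 *	} else {
 *		... fall back to a reserved low-order VPID, or run with
 *		    VPID disabled ...
 *	}
 */
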
298 #define	INVEPT_SINGLE_CONTEXT	1
299 #define	INVEPT_ALL_CONTEXTS	2
300 
301 static int
302 hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused)
303 {
304 	int flag = (int)arg1;
305 	uintptr_t eptp = (uintptr_t)arg2;
306 
307 	ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS);
308 
309 	VERIFY0(hma_vmx_do_invept(flag, eptp));
310 	return (0);
311 }
312 
313 void
314 hma_vmx_invept_allcpus(uintptr_t eptp)
315 {
316 	int flag = -1;
317 	cpuset_t set;
318 
319 	if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) {
320 		flag = INVEPT_SINGLE_CONTEXT;
321 	} else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
322 		flag = INVEPT_ALL_CONTEXTS;
323 		eptp = 0;
324 	} else {
325 		return;
326 	}
327 
328 	cpuset_zero(&set);
329 	mutex_enter(&cpu_lock);
330 
331 	cpuset_or(&set, &cpu_active_set);
332 	xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set),
333 	    hma_vmx_invept_xcall);
334 
335 	mutex_exit(&cpu_lock);
336 }
337 
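/*
 * Example (illustrative sketch): a hypervisor would typically invoke this
 * after altering an EPT hierarchy in a way which must become visible on all
 * host CPUs, such as unmapping guest memory (hypothetical caller):
 *
 *	... remove or modify EPT mappings for the VM ...
 *	hma_vmx_invept_allcpus((uintptr_t)vm_eptp);
 */
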
338 static int
339 hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
340     xc_arg_t arg3 __unused)
341 {
342 	uint64_t fctrl;
343 	const processorid_t id = CPU->cpu_seqid;
344 	hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);
345 
346 	VERIFY(vmx_cpu->hvc_vmxon_page != NULL);
347 	VERIFY(vmx_cpu->hvc_vmxon_pa != 0);
348 
349 	/*
350 	 * Ensure that the VMX support and lock bits are enabled in the
351 	 * feature-control MSR.
352 	 */
353 	fctrl = rdmsr(MSR_IA32_FEAT_CTRL);
354 	if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 ||
355 	    (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) {
356 		fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK;
357 		wrmsr(MSR_IA32_FEAT_CTRL, fctrl);
358 	}
359 
360 	setcr4(getcr4() | CR4_VMXE);
361 
362 	if (hma_vmx_vmxon(vmx_cpu->hvc_vmxon_pa) == 0) {
363 		hma_cpu[id].hc_status = HCS_READY;
364 	} else {
365 		hma_cpu[id].hc_status = HCS_ERROR;
366 
367 		/*
368 		 * If VMX has already been marked active and available for the
369 		 * system, then failure to perform VMXON on a newly-onlined CPU
370 		 * represents a fatal problem.  Continuing on would mean
371 		 * failure for any hypervisor thread which landed here.
372 		 */
373 		if (hma_vmx_ready) {
374 			panic("VMXON failure after VMX marked ready");
375 		}
376 	}
377 	return (0);
378 }
379 
380 static int
381 hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
382 {
383 	hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);
384 
385 	ASSERT(MUTEX_HELD(&cpu_lock));
386 	ASSERT(id >= 0 && id < NCPU);
387 
388 	if (what != CPU_ON) {
389 		/*
390 		 * For the purposes of VMX setup, only the CPU_ON event is of
391 		 * interest.  Letting VMX state linger on an offline CPU should
392 		 * not cause any harm.
393 		 *
394 		 * This logic assumes that any offlining activity is strictly
395 		 * administrative in nature and will not alter any existing
396 		 * configuration (such as %cr4 bits previously set).
397 		 */
398 		return (0);
399 	}
400 
401 	const hma_cpu_status_t status = hma_cpu[id].hc_status;
402 	if (status == HCS_ERROR) {
403 		return (-1);
404 	}
405 
406 	/* Allocate the VMXON page for this CPU, if not already done */
407 	if (vmx_cpu->hvc_vmxon_page == NULL) {
408 		caddr_t va;
409 		pfn_t pfn;
410 
411 		va = kmem_alloc(PAGESIZE, KM_SLEEP);
412 		VERIFY0((uintptr_t)va & PAGEOFFSET);
413 		vmx_cpu->hvc_vmxon_page = va;
414 
415 		/* Initialize the VMX revision field as expected */
416 		bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision));
417 
418 		/*
419 		 * Cache the physical address of the VMXON page rather than
420 		 * looking it up later when the potential blocking of
421 		 * hat_getpfnum would be less acceptable.
422 		 */
423 		pfn = hat_getpfnum(kas.a_hat, va);
424 		vmx_cpu->hvc_vmxon_pa = (pfn << PAGESHIFT);
425 	} else {
426 		VERIFY(vmx_cpu->hvc_vmxon_pa != 0);
427 	}
428 
429 	if (status == HCS_UNINITIALIZED) {
430 		cpuset_t set;
431 
432 		/* Activate VMX on this CPU */
433 		cpuset_zero(&set);
434 		cpuset_add(&set, id);
435 		xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon);
436 	} else {
437 		VERIFY3U(status, ==, HCS_READY);
438 
439 		/*
440 		 * If an already-initialized CPU is going back online, perform
441 		 * an all-contexts invept to eliminate the possibility of
442 		 * cached EPT state causing issues.
443 		 */
444 		if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
445 			cpuset_t set;
446 
447 			cpuset_zero(&set);
448 			cpuset_add(&set, id);
449 			xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0,
450 			    CPUSET2BV(set), hma_vmx_invept_xcall);
451 		}
452 	}
453 
454 	return (hma_cpu[id].hc_status != HCS_READY);
455 }
456 
457 /*
458  * Determining the availability of VM execution controls is somewhat different
459  * from conventional means, where one simply checks for asserted bits in the
460  * MSR value.  Instead, these execution control MSRs are split into two halves:
461  * the lower 32 bits report the allowed 0-settings (a control may be cleared
462  * to 0 only if its bit there is 0), while the upper 32 bits report the
463  * allowed 1-settings (a control may be set to 1 only if its bit there is 1).
464  * This layout is described in detail in Appendix A.3 of SDM volume 3.
465  */
466 #define	VMX_CTL_ONE_SETTING(val, flag)	\
467 	(((val) & ((uint64_t)(flag) << 32)) != 0)
468 
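/*
 * For illustration only: by the same layout, a control may be cleared to
 * zero in the VMCS if its bit in the lower 32-bits of the MSR is not
 * asserted.  A hypothetical helper (not needed by the checks below) could be:
 *
 *	#define	VMX_CTL_ZERO_SETTING(val, flag)	\
 *		(((val) & (uint64_t)(flag)) == 0)
 */
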
469 static const char *
470 hma_vmx_query_details(void)
471 {
472 	boolean_t query_true_ctl = B_FALSE;
473 	uint64_t msr;
474 
475 	/* The basic INS/OUTS functionality is cited as a necessary prereq */
476 	msr = rdmsr(MSR_IA32_VMX_BASIC);
477 	if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) {
478 		return ("VMX does not support INS/OUTS");
479 	}
480 
481 	/* Record the VMX revision for later VMXON usage */
482 	hma_vmx_revision = (uint32_t)msr;
483 
484 	/*
485 	 * Bit 55 in the VMX_BASIC MSR determines how VMX control information
486 	 * can be queried.
487 	 */
488 	query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0;
489 
490 	/* Check for EPT and VPID support */
491 	msr = rdmsr(query_true_ctl ?
492 	    MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS);
493 	if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) {
494 		msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS);
495 		if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) {
496 			hma_vmx_capabs |= HVC_EPT;
497 		}
498 		if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) {
499 			hma_vmx_capabs |= HVC_VPID;
500 		}
501 	}
502 
503 	/* Check for INVEPT support */
504 	if ((hma_vmx_capabs & HVC_EPT) != 0) {
505 		msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
506 		if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) {
507 			if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) {
508 				hma_vmx_capabs |= HVC_INVEPT_ONE;
509 			}
510 			if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) {
511 				hma_vmx_capabs |= HVC_INVEPT_ALL;
512 			}
513 		}
514 	}
515 
516 	return (NULL);
517 }
518 
519 static int
520 hma_vmx_init(void)
521 {
522 	cpu_t *cp;
523 	uint64_t msr;
524 	int err = 0;
525 	const char *msg = NULL;
526 
527 	if (!is_x86_feature(x86_featureset, X86FSET_VMX)) {
528 		msg = "CPU does not support VMX";
529 		goto bail;
530 	}
531 
532 	/* Has the BIOS set the feature-control lock bit without VMX enabled? */
533 	msr = rdmsr(MSR_IA32_FEAT_CTRL);
534 	if ((msr & IA32_FEAT_CTRL_LOCK) != 0 &&
535 	    (msr & IA32_FEAT_CTRL_VMX_EN) == 0) {
536 		msg = "VMX support disabled by BIOS";
537 		goto bail;
538 	}
539 
540 	msg = hma_vmx_query_details();
541 	if (msg != NULL) {
542 		goto bail;
543 	}
544 
545 	mutex_enter(&cpu_lock);
546 	/* Perform VMX configuration for already-online CPUs. */
547 	cp = cpu_active;
548 	do {
549 		err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
550 		if (err != 0) {
551 			msg = "failure during VMXON setup";
552 			mutex_exit(&cpu_lock);
553 			goto bail;
554 		}
555 	} while ((cp = cp->cpu_next_onln) != cpu_active);
556 
557 	/*
558 	 * Register callback for later-onlined CPUs and perform other remaining
559 	 * resource allocation.
560 	 */
561 	register_cpu_setup_func(hma_vmx_cpu_setup, NULL);
562 	mutex_exit(&cpu_lock);
563 
564 	hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1,
565 	    UINT16_MAX);
566 	hma_vmx_ready = B_TRUE;
567 
568 	return (0);
569 
570 bail:
571 	hma_vmx_error = msg;
572 	cmn_err(CE_NOTE, "!hma_vmx_init: %s", msg);
573 	return (-1);
574 }
575 
576 #define	VMCB_FLUSH_NOTHING	0x0
577 #define	VMCB_FLUSH_ALL		0x1
578 #define	VMCB_FLUSH_ASID		0x3
579 
580 void
581 hma_svm_asid_init(hma_svm_asid_t *vcp)
582 {
583 	/*
584 	 * Initialize the generation to 0, forcing an ASID allocation on first
585 	 * entry.  Leave the ASID at 0, so if the host forgoes the call to
586 	 * hma_svm_asid_update(), SVM will bail on the invalid vcpu state.
587 	 */
588 	vcp->hsa_gen = 0;
589 	vcp->hsa_asid = 0;
590 }
591 
592 uint8_t
593 hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid,
594     boolean_t npt_flush)
595 {
596 	/*
597 	 * Most ASID resource updates are expected to be performed as part of
598 	 * VMM entry into guest context, where interrupts would be disabled for
599 	 * the sake of state consistency.
600 	 *
601 	 * We demand this be the case, even though other situations which might
602 	 * incur an ASID update, such as userspace manipulation of guest vCPU
603 	 * state, may not require such consistency.
604 	 */
605 	ASSERT(!interrupts_enabled());
606 
607 	/*
608 	 * If NPT changes dictate a TLB flush and by-ASID flushing is not
609 	 * supported/used, force a fresh ASID allocation.
610 	 */
611 	if (npt_flush && !flush_by_asid) {
612 		vcp->hsa_gen = 0;
613 	}
614 
615 	hma_svm_asid_t *hcp = &(hma_svm_cpu(CPU->cpu_seqid)->hsc_asid);
616 	if (vcp->hsa_gen != hcp->hsa_gen) {
617 		hcp->hsa_asid++;
618 
619 		if (hcp->hsa_asid >= hma_svm_max_asid) {
620 			/* Keep the ASID properly constrained */
621 			hcp->hsa_asid = 1;
622 			hcp->hsa_gen++;
623 			if (hcp->hsa_gen == 0) {
624 				/*
625 				 * Stay clear of the '0' sentinel value for
626 				 * generation, if wrapping around.
627 				 */
628 				hcp->hsa_gen = 1;
629 			}
630 		}
631 		vcp->hsa_gen = hcp->hsa_gen;
632 		vcp->hsa_asid = hcp->hsa_asid;
633 
634 		ASSERT(vcp->hsa_asid != 0);
635 		ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid);
636 
637 		if (flush_by_asid) {
638 			return (VMCB_FLUSH_ASID);
639 		} else {
640 			return (VMCB_FLUSH_ALL);
641 		}
642 	} else if (npt_flush) {
643 		ASSERT(flush_by_asid);
644 		return (VMCB_FLUSH_ASID);
645 	}
646 
647 	return (VMCB_FLUSH_NOTHING);
648 }
649 
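/*
 * Example (illustrative sketch): a VMM would typically call
 * hma_svm_asid_init() once when creating a vCPU and then, with interrupts
 * disabled on the entry path, refresh the ASID immediately before VMRUN.
 * The vCPU structure and VMCB field names below are hypothetical.
 *
 *	hma_svm_asid_init(&vcpu->asid);
 *	...
 *	uint8_t flush = hma_svm_asid_update(&vcpu->asid, flush_by_asid,
 *	    npt_changed);
 *	vmcb->ctrl.asid = vcpu->asid.hsa_asid;
 *	vmcb->ctrl.tlb_ctrl = flush;
 */
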
650 void
651 hma_svm_gif_disable(void)
652 {
653 	/*
654 	 * Clear the GIF (masking interrupts) first, so the subsequent
655 	 * housekeeping can be done under its protection.
656 	 */
657 	__asm__ __volatile__("clgi");
658 
659 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
660 	const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 1);
661 
662 	if (old_gif != 0) {
663 		panic("GIF disable is set when expected to be clear");
664 	}
665 }
666 
667 void
668 hma_svm_gif_enable(void)
669 {
670 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
671 	const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 0);
672 
673 	if (old_gif == 0) {
674 		panic("GIF disable is clear when expected to be set");
675 	}
676 
677 	/*
678  * Set the GIF (un-masking interrupts) last, so the housekeeping
679 	 * will have been completed under its protection.
680 	 */
681 	__asm__ __volatile__("stgi");
682 }
683 
684 boolean_t
685 hma_svm_gif_is_disabled(void)
686 {
687 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
688 
689 	/*
690 	 * At the time of this writing, there exists no mechanism by which the
691 	 * state of the GIF on a CPU can be directly queried.  Rather than
692 	 * attempting an indirect means of checking its state, we track it
693 	 * manually through the HMA disable/enable functions.
694 	 */
695 	return (svm_cpu->hsc_gif_disabled != 0);
696 }
697 
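/*
 * Example (illustrative sketch): bracketing a section of guest-entry
 * housekeeping with the global interrupt flag cleared.  Purely hypothetical
 * consumer code; any real caller must guarantee the disable/enable pairing.
 *
 *	hma_svm_gif_disable();
 *	ASSERT(hma_svm_gif_is_disabled());
 *	... manipulate state which must not observe interrupts ...
 *	hma_svm_gif_enable();
 */
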
698 #define	EVTSEL_EN(evt) (((evt) & AMD_PERF_EVTSEL_CTR_EN) != 0)
699 #define	CPC_BASE_REGS	4
700 #define	CPC_EXTD_REGS	6
701 #define	MSR_CPC_EXTD_EVTSEL(idx)	(MSR_AMD_F15H_PERF_EVTSEL0 + (idx * 2))
702 #define	MSR_CPC_EXTD_CTR(idx)		(MSR_AMD_F15H_PERF_CTR0 + (idx * 2))
703 
704 /*
705  * AMD CPU Performance Counter Support
706  *
707  * This provides a means of safely saving/loading host CPC state, along with
708  * loading/saving guest CPC state upon guest entry/exit (respectively).
709  * Currently, this only supports the 6 "extended" performance counters
710  * (in MSRs C0010200h - C001020bh).  It pays no heed to any other CPC state such
711  * as the Northbridge counters or PerfMonV2 registers.
712  */
713 
714 hma_svm_cpc_res_t
715 hma_svm_cpc_enter(struct hma_svm_cpc_state *cpc_state)
716 {
717 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
718 
719 	ASSERT(!interrupts_enabled());
720 
721 	svm_cpu->hsc_cpc_saved_flags = HCF_DISABLED;
722 
723 	const hma_cpc_flags_t req_flags =
724 	    cpc_state->hscs_flags & hma_svm_cpc_allowed;
725 	if (req_flags == HCF_DISABLED) {
726 		return (HSCR_EMPTY);
727 	}
728 
729 	/* Extended regs should not be enabled without base */
730 	IMPLY((req_flags & HCF_EN_EXTD) != 0, (req_flags & HCF_EN_BASE) != 0);
731 
732 	const uint_t max_guest_reg =
733 	    (req_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
734 	uint_t guest_active = 0;
735 	for (uint_t i = 0; i < max_guest_reg; i++) {
736 		if (EVTSEL_EN(cpc_state->hscs_regs[i].hc_evtsel)) {
737 			guest_active++;
738 		}
739 	}
740 
741 	/*
742 	 * Guest is not currently measuring with any of the CPCs, so leave any
743 	 * host counters in place.
744 	 */
745 	if (guest_active == 0) {
746 		return (HSCR_EMPTY);
747 	}
748 
749 	/*
750 	 * Read (and save) the host evtsel values, counting the number of
751  * registers in active use.
752 	 */
753 	uint_t host_active = 0;
754 	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
755 		const uint64_t evtsel = rdmsr(MSR_CPC_EXTD_EVTSEL(i));
756 
757 		svm_cpu->hsc_cpc_host_regs[i].hc_evtsel = evtsel;
758 		if (EVTSEL_EN(evtsel)) {
759 			host_active++;
760 		}
761 	}
762 
763 	if (host_active != 0) {
764 		if (hma_cpc_priority == HCP_HOST_WINS) {
765 			/*
766 			 * Host has priority access to the perf counters over
767 			 * the guest, so just leave everything in place.
768 			 */
769 			DTRACE_PROBE2(hma_svm__guest_deferred,
770 			    processorid_t, CPU->cpu_seqid,
771 			    uint_t, guest_active);
772 			return (HSCR_EMPTY);
773 		}
774 
775 		DTRACE_PROBE2(hma_svm__host_deferred,
776 		    processorid_t, CPU->cpu_seqid, uint_t, host_active);
777 
778 		/*
779 		 * Disable any active host counters, trying to do so in as
780 		 * consistent a manner as possible.
781 		 */
782 		for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
783 			const uint64_t evtsel =
784 			    svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;
785 			wrmsr(MSR_CPC_EXTD_EVTSEL(i),
786 			    evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
787 		}
788 	}
789 
790 	/*
791 	 * With any active host counters stopped from collecting new events,
792 	 * save the counter values themselves before loading guest state.
793 	 */
794 	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
795 		svm_cpu->hsc_cpc_host_regs[i].hc_ctr =
796 		    rdmsr(MSR_CPC_EXTD_CTR(i));
797 	}
798 
799 	/*
800 	 * Now load the guest state, fixing it up with the flag necessary to
801 	 * collect events only while in guest context.
802 	 */
803 	for (uint_t i = 0; i < max_guest_reg; i++) {
804 		uint64_t evtsel = cpc_state->hscs_regs[i].hc_evtsel;
805 
806 		/*
807 		 * Clear any existing HG flags, as well as any request for
808 		 * interrupt enable. (Trapping the interrupt from guest counters
809 		 * is not presently supported.)
810 		 */
811 		evtsel &= ~(AMD_PERF_EVTSEL_HG_MASK | AMD_PERF_EVTSEL_INT_EN);
812 		/* And indicate guest-only event tracking */
813 		evtsel |= AMD_PERF_EVTSEL_HG_GUEST;
814 
815 		wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel);
816 		wrmsr(MSR_CPC_EXTD_CTR(i), cpc_state->hscs_regs[i].hc_ctr);
817 	}
818 
819 	svm_cpu->hsc_cpc_saved_flags = req_flags;
820 	return (HSCR_ACCESS_RDPMC | HSCR_ACCESS_CTR_MSR);
821 }
822 
823 void
824 hma_svm_cpc_exit(struct hma_svm_cpc_state *cpc_state)
825 {
826 	ASSERT(!interrupts_enabled());
827 
828 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
829 
830 	const hma_cpc_flags_t saved_flags = svm_cpu->hsc_cpc_saved_flags;
831 	if (saved_flags == HCF_DISABLED) {
832 		return;
833 	}
834 
835 	/* Save the guest counter values. */
836 	const uint_t max_guest_reg =
837 	    (saved_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
838 	for (uint_t i = 0; i < max_guest_reg; i++) {
839 		cpc_state->hscs_regs[i].hc_ctr = rdmsr(MSR_CPC_EXTD_CTR(i));
840 	}
841 
842 	/*
843 	 * Load the host values back, once again taking care to toggle the
844 	 * counter enable state as a separate step in an attempt to keep
845 	 * readings as consistent as possible
846 	 */
847 	uint_t host_active = 0;
848 	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
849 		const uint64_t evtsel = svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;
850 
851 		if (EVTSEL_EN(evtsel)) {
852 			host_active++;
853 		}
854 		wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
855 		wrmsr(MSR_CPC_EXTD_CTR(i),
856 		    svm_cpu->hsc_cpc_host_regs[i].hc_ctr);
857 	}
858 
859 	/*
860 	 * Allow any enabled host counters to collect events, now that all of
861 	 * the other state is loaded.
862 	 */
863 	if (host_active != 0) {
864 		for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
865 			wrmsr(MSR_CPC_EXTD_EVTSEL(i),
866 			    svm_cpu->hsc_cpc_host_regs[i].hc_evtsel);
867 		}
868 	}
869 }
870 
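/*
 * Example (illustrative sketch): the enter/exit pair brackets guest
 * execution on the entry path, with interrupts already disabled.  The vCPU
 * structure and intercept handling shown are hypothetical.
 *
 *	hma_svm_cpc_res_t res = hma_svm_cpc_enter(&vcpu->cpc_state);
 *
 *	... configure RDPMC/MSR intercepts based on 'res', then VMRUN ...
 *
 *	hma_svm_cpc_exit(&vcpu->cpc_state);
 */
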
871 static int
872 hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
873     xc_arg_t arg3 __unused)
874 {
875 	const processorid_t id = CPU->cpu_seqid;
876 	const uintptr_t hsave_pa = hma_svm_cpu(id)->hsc_hsave_pa;
877 	uint64_t efer;
878 
879 	VERIFY(hsave_pa != 0);
880 
881 	/* Enable SVM via EFER */
882 	efer = rdmsr(MSR_AMD_EFER);
883 	efer |= AMD_EFER_SVME;
884 	wrmsr(MSR_AMD_EFER, efer);
885 
886 	/* Setup hsave area */
887 	wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa);
888 
889 	hma_cpu[id].hc_status = HCS_READY;
890 	return (0);
891 }
892 
893 static int
894 hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
895 {
896 	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(id);
897 
898 	ASSERT(MUTEX_HELD(&cpu_lock));
899 	ASSERT(id >= 0 && id < NCPU);
900 
901 	switch (what) {
902 	case CPU_CONFIG:
903 	case CPU_ON:
904 	case CPU_INIT:
905 		break;
906 	default:
907 		/*
908 		 * Other events, such as CPU offlining, are of no interest.
909 		 * Letting the SVM state linger should not cause any harm.
910 		 *
911 		 * This logic assumes that any offlining activity is strictly
912 		 * administrative in nature and will not alter any existing
913 		 * configuration (such as EFER bits previously set).
914 		 */
915 		return (0);
916 	}
917 
918 	/* Perform initialization if it has not been previously attempted. */
919 	if (hma_cpu[id].hc_status != HCS_UNINITIALIZED) {
920 		return ((hma_cpu[id].hc_status == HCS_READY) ? 0 : -1);
921 	}
922 
923 	/* Allocate the hsave page for this CPU */
924 	if (svm_cpu->hsc_hsave_page == NULL) {
925 		caddr_t va;
926 		pfn_t pfn;
927 
928 		va = kmem_alloc(PAGESIZE, KM_SLEEP);
929 		VERIFY0((uintptr_t)va & PAGEOFFSET);
930 		svm_cpu->hsc_hsave_page = va;
931 
932 		/*
933 		 * Cache the physical address of the hsave page rather than
934 		 * looking it up later when the potential blocking of
935 		 * hat_getpfnum would be less acceptable.
936 		 */
937 		pfn = hat_getpfnum(kas.a_hat, va);
938 		svm_cpu->hsc_hsave_pa = (pfn << PAGESHIFT);
939 	} else {
940 		VERIFY(svm_cpu->hsc_hsave_pa != 0);
941 	}
942 
943 	kpreempt_disable();
944 	if (CPU->cpu_seqid == id) {
945 		/* Perform svm setup directly if this CPU is the target */
946 		(void) hma_svm_cpu_activate(0, 0, 0);
947 		kpreempt_enable();
948 	} else {
949 		cpuset_t set;
950 
951 		/* Use a cross-call if a remote CPU is the target */
952 		kpreempt_enable();
953 		cpuset_zero(&set);
954 		cpuset_add(&set, id);
955 		xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate);
956 	}
957 
958 	return (hma_cpu[id].hc_status != HCS_READY);
959 }
960 
961 static int
962 hma_svm_init(void)
963 {
964 	uint64_t msr;
965 	const char *msg = NULL;
966 	struct cpuid_regs regs;
967 	cpu_t *cp;
968 
969 	if (!is_x86_feature(x86_featureset, X86FSET_SVM)) {
970 		msg = "CPU does not support SVM";
971 		goto bail;
972 	}
973 
974 	msr = rdmsr(MSR_AMD_VM_CR);
975 	if ((msr & AMD_VM_CR_SVMDIS) != 0) {
976 		msg = "SVM disabled by BIOS";
977 		goto bail;
978 	}
979 
980 	regs.cp_eax = 0x8000000a;
981 	(void) cpuid_insn(NULL, &regs);
982 	const uint32_t nasid = regs.cp_ebx;
983 	const uint32_t feat = regs.cp_edx;
984 
985 	if (nasid == 0) {
986 		msg = "Not enough ASIDs for guests";
987 		goto bail;
988 	}
989 	if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) {
990 		msg = "CPU does not support nested paging";
991 		goto bail;
992 	}
993 	if ((feat & CPUID_AMD_EDX_NRIPS) == 0) {
994 		msg = "CPU does not support NRIP save";
995 		goto bail;
996 	}
997 
998 	hma_svm_features = feat;
999 	hma_svm_max_asid = nasid;
1000 
1001 	mutex_enter(&cpu_lock);
1002 	/* Perform SVM configuration for already-online CPUs. */
1003 	cp = cpu_active;
1004 	do {
1005 		int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
1006 		if (err != 0) {
1007 			msg = "failure during SVM setup";
1008 			mutex_exit(&cpu_lock);
1009 			goto bail;
1010 		}
1011 	} while ((cp = cp->cpu_next_onln) != cpu_active);
1012 
1013 	/*
1014 	 * Register callback for later-onlined CPUs and perform other remaining
1015 	 * resource allocation.
1016 	 */
1017 	register_cpu_setup_func(hma_svm_cpu_setup, NULL);
1018 	mutex_exit(&cpu_lock);
1019 
1020 	/* Initialize per-CPU ASID state. */
1021 	for (uint_t i = 0; i < NCPU; i++) {
1022 		/*
1023 		 * Skip past sentinel 0 value for generation.  Doing so for
1024 		 * ASID is unneeded, since it will be incremented during the
1025 		 * first allocation.
1026 		 */
1027 		hma_svm_asid_t *cpu_asid = &hma_svm_cpu(i)->hsc_asid;
1028 		cpu_asid->hsa_gen = 1;
1029 		cpu_asid->hsa_asid = 0;
1030 	}
1031 
1032 	/*
1033 	 * For now, only expose performance counter support if the host supports
1034 	 * "extended" counters.  This makes MSR access more consistent for logic
1035 	 * handling that state.
1036 	 */
1037 	if (is_x86_feature(x86_featureset, X86FSET_AMD_PCEC)) {
1038 		hma_svm_cpc_allowed = HCF_EN_BASE | HCF_EN_EXTD;
1039 	}
1040 
1041 	hma_svm_ready = B_TRUE;
1042 	return (0);
1043 
1044 bail:
1045 	hma_svm_error = msg;
1046 	cmn_err(CE_NOTE, "!hma_svm_init: %s", msg);
1047 	return (-1);
1048 }
1049