/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/cpuvar.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/machsystm.h>
#include <sys/archsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/id_space.h>
#include <sys/hma.h>
#include <sys/cmn_err.h>
#include <vm/hat.h>
#include <vm/as.h>

struct hma_reg {
        const char *hr_name;
        list_node_t hr_node;
};

static kmutex_t hma_lock;
static list_t hma_registrations;
static boolean_t hma_exclusive = B_FALSE;
int hma_disable = 0;

typedef enum hma_cpu_status {
        HCS_UNINITIALIZED = 0,
        HCS_READY,
        HCS_ERROR
} hma_cpu_status_t;

/*
 * When both host and guest want simultaneous use of the CPU performance
 * counters, which should take priority?
 *
 * Defer to the guest by default, making its activity invisible to
 * host-configured CPC measurements. This is necessary since the Capacity &
 * Utilization system keeps the CPCs active at all times when not in use by
 * libcpc or dtrace users.
 */
typedef enum hma_cpc_priority {
        HCP_HOST_WINS = 0,
        HCP_GUEST_WINS = 1,
} hma_cpc_priority_t;
static hma_cpc_priority_t hma_cpc_priority = HCP_GUEST_WINS;

/*
 * VMX-specific per-CPU data
 */
typedef struct hma_vmx_cpu {
        void *hvc_vmxon_page;
        uintptr_t hvc_vmxon_pa;

} hma_vmx_cpu_t;

/*
 * SVM-specific per-CPU data
 */
typedef struct hma_svm_cpu {
        void *hsc_hsave_page;
        uintptr_t hsc_hsave_pa;
        hma_svm_asid_t hsc_asid;
        uint_t hsc_gif_disabled;
        /*
         * hsc_cpc_saved_flags stores the state of guest performance counters
         * while inside the hma_svm_cpc_enter/hma_svm_cpc_exit critical
         * section.
         *
         * If, due to the state of host counters, requested guest counters, and
         * hma_cpc_priority, the guest counters are _not_ loaded during
         * hma_svm_cpc_enter(), then this field will hold HCF_DISABLED,
         * indicating that no state restoration is required during
         * hma_svm_cpc_exit().
         *
         * When hsc_cpc_saved_flags is not HCF_DISABLED, then hsc_cpc_host_regs
         * will hold the saved host CPC state while the guest state occupies
         * those registers in the CPU.
         */
        hma_cpc_flags_t hsc_cpc_saved_flags;
        hma_cpc_t hsc_cpc_host_regs[6];
} hma_svm_cpu_t;

/*
 * Combined per-CPU state data
 *
 * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a
 * mutex specific to the module. It (cpu_lock) is already required for the
 * state needed to perform setup on all CPUs, so it was a natural fit to
 * protect this data too.
 */
struct hma_cpu {
        union {
                struct hma_vmx_cpu vmx;
                struct hma_svm_cpu svm;
        } hc_u;
        hma_cpu_status_t hc_status;
        uintptr_t _hc_padding[6];
} hma_cpu[NCPU];

/* Keep per-CPU state aligned to cache line size to avoid false sharing */
CTASSERT(sizeof (struct hma_cpu) % _CACHE_LINE_SIZE == 0);


static boolean_t hma_vmx_ready = B_FALSE;
static const char *hma_vmx_error = NULL;
static id_space_t *hma_vmx_vpid;

/* HMA-internal tracking of optional VMX capabilities */
typedef enum {
        HVC_EPT = (1 << 0),
        HVC_VPID = (1 << 1),
        HVC_INVEPT_ONE = (1 << 2),
        HVC_INVEPT_ALL = (1 << 3),
} hma_vmx_capab_t;

static uint32_t hma_vmx_revision;
static hma_vmx_capab_t hma_vmx_capabs = 0;

static boolean_t hma_svm_ready = B_FALSE;
static const char *hma_svm_error = NULL;
static uint32_t hma_svm_features;
static uint32_t hma_svm_max_asid;
static hma_cpc_flags_t hma_svm_cpc_allowed = HCF_DISABLED;

static int hma_vmx_init(void);
static int hma_svm_init(void);

/* Helpers from ml/hma_asm.s */
int hma_vmx_do_invept(int, uintptr_t);
int hma_vmx_vmxon(uintptr_t);

void
hma_init(void)
{
        mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&hma_registrations, sizeof (struct hma_reg),
            offsetof(struct hma_reg, hr_node));

        if (hma_disable != 0) {
                cmn_err(CE_CONT, "?hma_init: disabled");
                return;
        }

        switch (cpuid_getvendor(CPU)) {
        case X86_VENDOR_Intel:
                (void) hma_vmx_init();
                break;
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
                (void) hma_svm_init();
                break;
        default:
                break;
        }
}

static hma_reg_t *
hma_register_backend(const char *name)
{
        struct hma_reg *reg;
        boolean_t is_ready;

        ASSERT(MUTEX_HELD(&hma_lock));

        switch (cpuid_getvendor(CPU)) {
        case X86_VENDOR_Intel:
                is_ready = hma_vmx_ready;
                break;
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
                is_ready = hma_svm_ready;
                break;
        default:
                is_ready = B_FALSE;
                break;
        }

        if (!is_ready)
                return (NULL);

        reg = kmem_zalloc(sizeof (*reg), KM_SLEEP);
        reg->hr_name = name;
        list_insert_tail(&hma_registrations, reg);

        return (reg);
}

hma_reg_t *
hma_register(const char *name)
{
        struct hma_reg *reg = NULL;

        VERIFY(name != NULL);

        mutex_enter(&hma_lock);

        if (!hma_exclusive)
                reg = hma_register_backend(name);

        mutex_exit(&hma_lock);

        return (reg);
}

hma_reg_t *
hma_register_exclusive(const char *name)
{
        struct hma_reg *reg = NULL;

        VERIFY(name != NULL);

        mutex_enter(&hma_lock);

        if (list_is_empty(&hma_registrations)) {
                reg = hma_register_backend(name);
                if (reg != NULL)
                        hma_exclusive = B_TRUE;
        }

        mutex_exit(&hma_lock);

        return (reg);
}

void
hma_unregister(hma_reg_t *reg)
{
        VERIFY(reg != NULL);
        VERIFY(!list_is_empty(&hma_registrations));

        mutex_enter(&hma_lock);
        list_remove(&hma_registrations, reg);
        if (hma_exclusive && list_is_empty(&hma_registrations))
                hma_exclusive = B_FALSE;
        mutex_exit(&hma_lock);
        kmem_free(reg, sizeof (*reg));
}
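
/*
 * Illustrative sketch (not part of HMA): a hypothetical hypervisor provider
 * would bracket its use of the virtualization hardware with a registration,
 * dropping it once all of its VMs are torn down.  The caller-side names below
 * are assumptions for the sake of the example.
 *
 *    hma_reg_t *reg = hma_register("my_hypervisor");
 *    if (reg == NULL)
 *            return (ENXIO);    (HMA not ready, or held exclusively)
 *    ... create and run VMs ...
 *    hma_unregister(reg);
 */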

static __inline hma_vmx_cpu_t *
hma_vmx_cpu(processorid_t id)
{
        return (&hma_cpu[id].hc_u.vmx);
}

static __inline hma_svm_cpu_t *
hma_svm_cpu(processorid_t id)
{
        return (&hma_cpu[id].hc_u.svm);
}

/*
 * VPID 0 is reserved for instances where VPID is disabled. Some hypervisors
 * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if
 * unique VPIDs could not be allocated for all the vCPUs belonging to a VM.
 */
#define HMA_VPID_RESERVED NCPU

uint16_t
hma_vmx_vpid_alloc(void)
{
        id_t res;

        /* Do not bother if the CPU lacks support */
        if ((hma_vmx_capabs & HVC_VPID) == 0) {
                return (0);
        }

        res = id_alloc_nosleep(hma_vmx_vpid);
        if (res == -1) {
                return (0);
        } else {
                ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX);
                return (res);
        }
}

void
hma_vmx_vpid_free(uint16_t vpid)
{
        VERIFY(vpid > HMA_VPID_RESERVED);
        id_free(hma_vmx_vpid, (id_t)vpid);
}
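
/*
 * Illustrative sketch (hypothetical caller, not part of HMA): a hypervisor
 * would typically allocate a distinct VPID per vCPU and free it at teardown,
 * falling back to its own reserved-VPID scheme when the space is exhausted or
 * VPID is unsupported (both reported as 0).  The "vcpu" structure and
 * fallback_vpid() helper are assumptions here.
 *
 *    uint16_t vpid = hma_vmx_vpid_alloc();
 *    vcpu->vpid = (vpid != 0) ? vpid : fallback_vpid(vcpu);
 *    ...
 *    if (vcpu->vpid > HMA_VPID_RESERVED)
 *            hma_vmx_vpid_free(vcpu->vpid);
 */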

#define INVEPT_SINGLE_CONTEXT 1
#define INVEPT_ALL_CONTEXTS 2

static int
hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused)
{
        int flag = (int)arg1;
        uintptr_t eptp = (uintptr_t)arg2;

        ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS);

        VERIFY0(hma_vmx_do_invept(flag, eptp));
        return (0);
}

void
hma_vmx_invept_allcpus(uintptr_t eptp)
{
        int flag = -1;
        cpuset_t set;

        if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) {
                flag = INVEPT_SINGLE_CONTEXT;
        } else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
                flag = INVEPT_ALL_CONTEXTS;
                eptp = 0;
        } else {
                return;
        }

        cpuset_zero(&set);
        mutex_enter(&cpu_lock);

        cpuset_or(&set, &cpu_active_set);
        xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set),
            hma_vmx_invept_xcall);

        mutex_exit(&cpu_lock);
}

static int
hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
        uint64_t fctrl;
        const processorid_t id = CPU->cpu_seqid;
        hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);

        VERIFY(vmx_cpu->hvc_vmxon_page != NULL);
        VERIFY(vmx_cpu->hvc_vmxon_pa != 0);

        /*
         * Ensure that the VMX support and lock bits are enabled in the
         * feature-control MSR.
         */
        fctrl = rdmsr(MSR_IA32_FEAT_CTRL);
        if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 ||
            (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) {
                fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK;
                wrmsr(MSR_IA32_FEAT_CTRL, fctrl);
        }

        setcr4(getcr4() | CR4_VMXE);

        if (hma_vmx_vmxon(vmx_cpu->hvc_vmxon_pa) == 0) {
                hma_cpu[id].hc_status = HCS_READY;
        } else {
                hma_cpu[id].hc_status = HCS_ERROR;

                /*
                 * If VMX has already been marked active and available for the
                 * system, then failure to perform VMXON on a newly-onlined CPU
                 * represents a fatal problem. Continuing on would mean
                 * failure for any hypervisor thread which landed here.
                 */
                if (hma_vmx_ready) {
                        panic("VMXON failure after VMX marked ready");
                }
        }
        return (0);
}

static int
hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
{
        hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(id >= 0 && id < NCPU);

        if (what != CPU_ON) {
                /*
                 * For the purposes of VMX setup, only the CPU_ON event is of
                 * interest. Letting VMX state linger on an offline CPU should
                 * not cause any harm.
                 *
                 * This logic assumes that any offlining activity is strictly
                 * administrative in nature and will not alter any existing
                 * configuration (such as %cr4 bits previously set).
                 */
                return (0);
        }

        const hma_cpu_status_t status = hma_cpu[id].hc_status;
        if (status == HCS_ERROR) {
                return (-1);
        }

        /* Allocate the VMXON page for this CPU, if not already done */
        if (vmx_cpu->hvc_vmxon_page == NULL) {
                caddr_t va;
                pfn_t pfn;

                va = kmem_alloc(PAGESIZE, KM_SLEEP);
                VERIFY0((uintptr_t)va & PAGEOFFSET);
                vmx_cpu->hvc_vmxon_page = va;

                /* Initialize the VMX revision field as expected */
                bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision));

                /*
                 * Cache the physical address of the VMXON page rather than
                 * looking it up later when the potential blocking of
                 * hat_getpfnum would be less acceptable.
                 */
                pfn = hat_getpfnum(kas.a_hat, va);
                vmx_cpu->hvc_vmxon_pa = (pfn << PAGESHIFT);
        } else {
                VERIFY(vmx_cpu->hvc_vmxon_pa != 0);
        }

        if (status == HCS_UNINITIALIZED) {
                cpuset_t set;

                /* Activate VMX on this CPU */
                cpuset_zero(&set);
                cpuset_add(&set, id);
                xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon);
        } else {
                VERIFY3U(status, ==, HCS_READY);

                /*
                 * If an already-initialized CPU is going back online, perform
                 * an all-contexts invept to eliminate the possibility of
                 * cached EPT state causing issues.
                 */
                if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
                        cpuset_t set;

                        cpuset_zero(&set);
                        cpuset_add(&set, id);
                        xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0,
                            CPUSET2BV(set), hma_vmx_invept_xcall);
                }
        }

        return (hma_cpu[id].hc_status != HCS_READY);
}

/*
 * Determining the availability of VM execution controls is somewhat different
 * from conventional means, where one simply checks for asserted bits in the
 * MSR value. Instead, these execution control MSRs are split into two halves:
 * the lower 32-bits indicating capabilities which can be zeroed in the VMCS
 * field and the upper 32-bits indicating capabilities which can be set to one.
 *
 * It is described in detail in Appendix A.3 of SDM volume 3.
 */
#define VMX_CTL_ONE_SETTING(val, flag) \
        (((val) & ((uint64_t)(flag) << 32)) != 0)
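
/*
 * Worked example (illustrative): if a secondary control were represented by
 * bit 1 of the 32-bit VMCS field, then VMX_CTL_ONE_SETTING(msr, 1 << 1) tests
 * bit 33 of the capability MSR -- the "allowed-1" half -- to determine whether
 * that control may be set.  The low half (bit 1 here) would instead indicate
 * whether the control may be cleared.  The bit position is hypothetical.
 */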

static const char *
hma_vmx_query_details(void)
{
        boolean_t query_true_ctl = B_FALSE;
        uint64_t msr;

        /* The basic INS/OUTS functionality is cited as a necessary prereq */
        msr = rdmsr(MSR_IA32_VMX_BASIC);
        if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) {
                return ("VMX does not support INS/OUTS");
        }

        /* Record the VMX revision for later VMXON usage */
        hma_vmx_revision = (uint32_t)msr;

        /*
         * Bit 55 in the VMX_BASIC MSR determines how VMX control information
         * can be queried.
         */
        query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0;

        /* Check for EPT and VPID support */
        msr = rdmsr(query_true_ctl ?
            MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS);
        if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) {
                msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS);
                if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) {
                        hma_vmx_capabs |= HVC_EPT;
                }
                if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) {
                        hma_vmx_capabs |= HVC_VPID;
                }
        }

        /* Check for INVEPT support */
        if ((hma_vmx_capabs & HVC_EPT) != 0) {
                msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
                if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) {
                        if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) {
                                hma_vmx_capabs |= HVC_INVEPT_ONE;
                        }
                        if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) {
                                hma_vmx_capabs |= HVC_INVEPT_ALL;
                        }
                }
        }

        return (NULL);
}

static int
hma_vmx_init(void)
{
        cpu_t *cp;
        uint64_t msr;
        int err = 0;
        const char *msg = NULL;

        if (!is_x86_feature(x86_featureset, X86FSET_VMX)) {
                msg = "CPU does not support VMX";
                goto bail;
        }

        /* Has the BIOS set the feature-control lock bit without VMX enabled? */
        msr = rdmsr(MSR_IA32_FEAT_CTRL);
        if ((msr & IA32_FEAT_CTRL_LOCK) != 0 &&
            (msr & IA32_FEAT_CTRL_VMX_EN) == 0) {
                msg = "VMX support disabled by BIOS";
                goto bail;
        }

        msg = hma_vmx_query_details();
        if (msg != NULL) {
                goto bail;
        }

        mutex_enter(&cpu_lock);
        /* Perform VMX configuration for already-online CPUs. */
        cp = cpu_active;
        do {
                err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
                if (err != 0) {
                        msg = "failure during VMXON setup";
                        mutex_exit(&cpu_lock);
                        goto bail;
                }
        } while ((cp = cp->cpu_next_onln) != cpu_active);

        /*
         * Register callback for later-onlined CPUs and perform other remaining
         * resource allocation.
         */
        register_cpu_setup_func(hma_vmx_cpu_setup, NULL);
        mutex_exit(&cpu_lock);

        hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1,
            UINT16_MAX);
        hma_vmx_ready = B_TRUE;

        return (0);

bail:
        hma_vmx_error = msg;
        cmn_err(CE_NOTE, "!hma_vmx_init: %s", msg);
        return (-1);
}

#define VMCB_FLUSH_NOTHING 0x0
#define VMCB_FLUSH_ALL 0x1
#define VMCB_FLUSH_ASID 0x3

void
hma_svm_asid_init(hma_svm_asid_t *vcp)
{
        /*
         * Initialize the generation to 0, forcing an ASID allocation on first
         * entry. Leave the ASID at 0, so if the host forgoes the call to
         * hma_svm_asid_update(), SVM will bail on the invalid vcpu state.
         */
        vcp->hsa_gen = 0;
        vcp->hsa_asid = 0;
}

uint8_t
hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid,
    boolean_t npt_flush)
{
        /*
         * Most ASID resource updates are expected to be performed as part of
         * VMM entry into guest context, where interrupts would be disabled for
         * the sake of state consistency.
         *
         * We demand this be the case, even though other situations which might
         * incur an ASID update, such as userspace manipulation of guest vCPU
         * state, may not require such consistency.
         */
        ASSERT(!interrupts_enabled());

        /*
         * If NPT changes dictate a TLB flush and by-ASID flushing is not
         * supported/used, force a fresh ASID allocation.
         */
        if (npt_flush && !flush_by_asid) {
                vcp->hsa_gen = 0;
        }

        hma_svm_asid_t *hcp = &(hma_svm_cpu(CPU->cpu_seqid)->hsc_asid);
        if (vcp->hsa_gen != hcp->hsa_gen) {
                hcp->hsa_asid++;

                if (hcp->hsa_asid >= hma_svm_max_asid) {
                        /* Keep the ASID properly constrained */
                        hcp->hsa_asid = 1;
                        hcp->hsa_gen++;
                        if (hcp->hsa_gen == 0) {
                                /*
                                 * Stay clear of the '0' sentinel value for
                                 * generation, if wrapping around.
                                 */
                                hcp->hsa_gen = 1;
                        }
                }
                vcp->hsa_gen = hcp->hsa_gen;
                vcp->hsa_asid = hcp->hsa_asid;

                ASSERT(vcp->hsa_asid != 0);
                ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid);

                if (flush_by_asid) {
                        return (VMCB_FLUSH_ASID);
                } else {
                        return (VMCB_FLUSH_ALL);
                }
        } else if (npt_flush) {
                ASSERT(flush_by_asid);
                return (VMCB_FLUSH_ASID);
        }

        return (VMCB_FLUSH_NOTHING);
}
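
/*
 * Illustrative sketch (hypothetical VMM entry path, not part of HMA): with
 * interrupts disabled just prior to VMRUN, a caller refreshes its vCPU ASID
 * and applies the returned flush directive to the VMCB.  The vcpu/vmcb field
 * names are assumptions for the sake of the example.
 *
 *    uint8_t flush = hma_svm_asid_update(&vcpu->asid, has_flushbyasid,
 *        nptgen_changed);
 *    vmcb->ctrl_asid = vcpu->asid.hsa_asid;
 *    vmcb->ctrl_tlb_ctrl = flush;
 */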

void
hma_svm_gif_disable(void)
{
        /*
         * Clear the GIF (masking interrupts) first, so the subsequent
         * housekeeping can be done under its protection.
         */
        __asm__ __volatile__("clgi");

        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
        const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 1);

        if (old_gif != 0) {
                panic("GIF disable is set when expected to be clear");
        }
}

void
hma_svm_gif_enable(void)
{
        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
        const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 0);

        if (old_gif == 0) {
                panic("GIF disable is clear when expected to be set");
        }

        /*
         * Set the GIF (un-masking interrupts) last, so the housekeeping will
         * have been completed under its protection.
         */
        __asm__ __volatile__("stgi");
}

boolean_t
hma_svm_gif_is_disabled(void)
{
        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

        /*
         * At the time of this writing, there exists no mechanism by which the
         * state of the GIF on a CPU can be directly queried. Rather than
         * attempting an indirect means of checking its state, we track it
         * manually through the HMA disable/enable functions.
         */
        return (svm_cpu->hsc_gif_disabled != 0);
}
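
/*
 * Illustrative sketch (hypothetical caller, not part of HMA): a section which
 * must run without any interruption, including NMIs, can be bracketed by the
 * GIF helpers:
 *
 *    hma_svm_gif_disable();
 *    ASSERT(hma_svm_gif_is_disabled());
 *    ... work which must not be disturbed ...
 *    hma_svm_gif_enable();
 */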

#define EVTSEL_EN(evt) (((evt) & AMD_PERF_EVTSEL_CTR_EN) != 0)
#define CPC_BASE_REGS 4
#define CPC_EXTD_REGS 6
#define MSR_CPC_EXTD_EVTSEL(idx) (MSR_AMD_F15H_PERF_EVTSEL0 + (idx * 2))
#define MSR_CPC_EXTD_CTR(idx) (MSR_AMD_F15H_PERF_CTR0 + (idx * 2))
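
/*
 * The "extended" counter MSRs interleave evtsel and counter registers, so for
 * example MSR_CPC_EXTD_EVTSEL(1) is MSR_AMD_F15H_PERF_EVTSEL0 + 2 and
 * MSR_CPC_EXTD_CTR(1) is MSR_AMD_F15H_PERF_CTR0 + 2, covering the six
 * evtsel/counter pairs in MSRs C0010200h - C001020bh noted below.
 */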

/*
 * AMD CPU Performance Counter Support
 *
 * This provides a means of safely saving/loading host CPC state, along with
 * loading/saving guest CPC state upon guest entry/exit (respectively).
 * Currently, this only supports the 6 "extended" performance counters
 * (in MSRs C0010200h - C001020bh). It pays no heed to any other CPC state
 * such as the Northbridge counters or PerfMonV2 registers.
 */
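
/*
 * Illustrative sketch (hypothetical guest-entry path, not part of HMA): the
 * enter/exit calls are expected to bracket VMRUN with interrupts disabled,
 * with the returned flags dictating whether RDPMC and counter MSR access may
 * be exposed to the guest for that entry.  The vcpu field name is an
 * assumption for the sake of the example.
 *
 *    hma_svm_cpc_res_t res = hma_svm_cpc_enter(&vcpu->cpc_state);
 *    ... configure intercepts based on res, then VMRUN ...
 *    hma_svm_cpc_exit(&vcpu->cpc_state);
 */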

hma_svm_cpc_res_t
hma_svm_cpc_enter(struct hma_svm_cpc_state *cpc_state)
{
        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

        ASSERT(!interrupts_enabled());

        svm_cpu->hsc_cpc_saved_flags = HCF_DISABLED;

        const hma_cpc_flags_t req_flags =
            cpc_state->hscs_flags & hma_svm_cpc_allowed;
        if (req_flags == HCF_DISABLED) {
                return (HSCR_EMPTY);
        }

        /* Extended regs should not be enabled without base */
        IMPLY((req_flags & HCF_EN_EXTD) != 0, (req_flags & HCF_EN_BASE) != 0);

        const uint_t max_guest_reg =
            (req_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
        uint_t guest_active = 0;
        for (uint_t i = 0; i < max_guest_reg; i++) {
                if (EVTSEL_EN(cpc_state->hscs_regs[i].hc_evtsel)) {
                        guest_active++;
                }
        }

        /*
         * Guest is not currently measuring with any of the CPCs, so leave any
         * host counters in place.
         */
        if (guest_active == 0) {
                return (HSCR_EMPTY);
        }

        /*
         * Read (and save) the host evtsel values, counting the number of
         * registers in active use
         */
        uint_t host_active = 0;
        for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
                const uint64_t evtsel = rdmsr(MSR_CPC_EXTD_EVTSEL(i));

                svm_cpu->hsc_cpc_host_regs[i].hc_evtsel = evtsel;
                if (EVTSEL_EN(evtsel)) {
                        host_active++;
                }
        }

        if (host_active != 0) {
                if (hma_cpc_priority == HCP_HOST_WINS) {
                        /*
                         * Host has priority access to the perf counters over
                         * the guest, so just leave everything in place.
                         */
                        DTRACE_PROBE2(hma_svm__guest_deferred,
                            processorid_t, CPU->cpu_seqid,
                            uint_t, guest_active);
                        return (HSCR_EMPTY);
                }

                DTRACE_PROBE2(hma_svm__host_deferred,
                    processorid_t, CPU->cpu_seqid, uint_t, host_active);

                /*
                 * Disable any active host counters, trying to do so in as
                 * consistent a manner as possible.
                 */
                for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
                        const uint64_t evtsel =
                            svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;
                        wrmsr(MSR_CPC_EXTD_EVTSEL(i),
                            evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
                }
        }

        /*
         * With any active host counters stopped from collecting new events,
         * save the counter values themselves before loading guest state.
         */
        for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
                svm_cpu->hsc_cpc_host_regs[i].hc_ctr =
                    rdmsr(MSR_CPC_EXTD_CTR(i));
        }

        /*
         * Now load the guest state, fixing it up with the flag necessary to
         * collect events only while in guest context.
         */
        for (uint_t i = 0; i < max_guest_reg; i++) {
                uint64_t evtsel = cpc_state->hscs_regs[i].hc_evtsel;

                /*
                 * Clear any existing HG flags, as well as any request for
                 * interrupt enable. (Trapping the interrupt from guest
                 * counters is not presently supported.)
                 */
                evtsel &= ~(AMD_PERF_EVTSEL_HG_MASK | AMD_PERF_EVTSEL_INT_EN);
                /* And indicate guest-only event tracking */
                evtsel |= AMD_PERF_EVTSEL_HG_GUEST;

                wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel);
                wrmsr(MSR_CPC_EXTD_CTR(i), cpc_state->hscs_regs[i].hc_ctr);
        }

        svm_cpu->hsc_cpc_saved_flags = req_flags;
        return (HSCR_ACCESS_RDPMC | HSCR_ACCESS_CTR_MSR);
}

void
hma_svm_cpc_exit(struct hma_svm_cpc_state *cpc_state)
{
        ASSERT(!interrupts_enabled());

        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

        const hma_cpc_flags_t saved_flags = svm_cpu->hsc_cpc_saved_flags;
        if (saved_flags == HCF_DISABLED) {
                return;
        }

        /* Save the guest counter values. */
        const uint_t max_guest_reg =
            (saved_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
        for (uint_t i = 0; i < max_guest_reg; i++) {
                cpc_state->hscs_regs[i].hc_ctr = rdmsr(MSR_CPC_EXTD_CTR(i));
        }

        /*
         * Load the host values back, once again taking care to toggle the
         * counter enable state as a separate step in an attempt to keep
         * readings as consistent as possible
         */
        uint_t host_active = 0;
        for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
                const uint64_t evtsel = svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;

                if (EVTSEL_EN(evtsel)) {
                        host_active++;
                }
                wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
                wrmsr(MSR_CPC_EXTD_CTR(i),
                    svm_cpu->hsc_cpc_host_regs[i].hc_ctr);
        }

        /*
         * Allow any enabled host counters to collect events, now that all of
         * the other state is loaded.
         */
        if (host_active != 0) {
                for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
                        wrmsr(MSR_CPC_EXTD_EVTSEL(i),
                            svm_cpu->hsc_cpc_host_regs[i].hc_evtsel);
                }
        }
}

static int
hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
        const processorid_t id = CPU->cpu_seqid;
        const uintptr_t hsave_pa = hma_svm_cpu(id)->hsc_hsave_pa;
        uint64_t efer;

        VERIFY(hsave_pa != 0);

        /* Enable SVM via EFER */
        efer = rdmsr(MSR_AMD_EFER);
        efer |= AMD_EFER_SVME;
        wrmsr(MSR_AMD_EFER, efer);

        /* Setup hsave area */
        wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa);

        hma_cpu[id].hc_status = HCS_READY;
        return (0);
}

static int
hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
{
        hma_svm_cpu_t *svm_cpu = hma_svm_cpu(id);

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(id >= 0 && id < NCPU);

        switch (what) {
        case CPU_CONFIG:
        case CPU_ON:
        case CPU_INIT:
                break;
        default:
                /*
                 * Other events, such as CPU offlining, are of no interest.
                 * Letting the SVM state linger should not cause any harm.
                 *
                 * This logic assumes that any offlining activity is strictly
                 * administrative in nature and will not alter any existing
                 * configuration (such as EFER bits previously set).
                 */
                return (0);
        }

        /* Perform initialization if it has not been previously attempted. */
        if (hma_cpu[id].hc_status != HCS_UNINITIALIZED) {
                return ((hma_cpu[id].hc_status == HCS_READY) ? 0 : -1);
        }

        /* Allocate the hsave page for this CPU */
        if (svm_cpu->hsc_hsave_page == NULL) {
                caddr_t va;
                pfn_t pfn;

                va = kmem_alloc(PAGESIZE, KM_SLEEP);
                VERIFY0((uintptr_t)va & PAGEOFFSET);
                svm_cpu->hsc_hsave_page = va;

                /*
                 * Cache the physical address of the hsave page rather than
                 * looking it up later when the potential blocking of
                 * hat_getpfnum would be less acceptable.
                 */
                pfn = hat_getpfnum(kas.a_hat, va);
                svm_cpu->hsc_hsave_pa = (pfn << PAGESHIFT);
        } else {
                VERIFY(svm_cpu->hsc_hsave_pa != 0);
        }

        kpreempt_disable();
        if (CPU->cpu_seqid == id) {
                /* Perform svm setup directly if this CPU is the target */
                (void) hma_svm_cpu_activate(0, 0, 0);
                kpreempt_enable();
        } else {
                cpuset_t set;

                /* Use a cross-call if a remote CPU is the target */
                kpreempt_enable();
                cpuset_zero(&set);
                cpuset_add(&set, id);
                xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate);
        }

        return (hma_cpu[id].hc_status != HCS_READY);
}

static int
hma_svm_init(void)
{
        uint64_t msr;
        const char *msg = NULL;
        struct cpuid_regs regs;
        cpu_t *cp;

        if (!is_x86_feature(x86_featureset, X86FSET_SVM)) {
                msg = "CPU does not support SVM";
                goto bail;
        }

        msr = rdmsr(MSR_AMD_VM_CR);
        if ((msr & AMD_VM_CR_SVMDIS) != 0) {
                msg = "SVM disabled by BIOS";
                goto bail;
        }

        regs.cp_eax = 0x8000000a;
        (void) cpuid_insn(NULL, &regs);
        const uint32_t nasid = regs.cp_ebx;
        const uint32_t feat = regs.cp_edx;

        if (nasid == 0) {
                msg = "Not enough ASIDs for guests";
                goto bail;
        }
        if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) {
                msg = "CPU does not support nested paging";
                goto bail;
        }
        if ((feat & CPUID_AMD_EDX_NRIPS) == 0) {
                msg = "CPU does not support NRIP save";
                goto bail;
        }

        hma_svm_features = feat;
        hma_svm_max_asid = nasid;

        mutex_enter(&cpu_lock);
        /* Perform SVM configuration for already-online CPUs. */
        cp = cpu_active;
        do {
                int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
                if (err != 0) {
                        msg = "failure during SVM setup";
                        mutex_exit(&cpu_lock);
                        goto bail;
                }
        } while ((cp = cp->cpu_next_onln) != cpu_active);

        /*
         * Register callback for later-onlined CPUs and perform other remaining
         * resource allocation.
         */
        register_cpu_setup_func(hma_svm_cpu_setup, NULL);
        mutex_exit(&cpu_lock);

        /* Initialize per-CPU ASID state. */
        for (uint_t i = 0; i < NCPU; i++) {
                /*
                 * Skip past sentinel 0 value for generation. Doing so for
                 * ASID is unneeded, since it will be incremented during the
                 * first allocation.
                 */
                hma_svm_asid_t *cpu_asid = &hma_svm_cpu(i)->hsc_asid;
                cpu_asid->hsa_gen = 1;
                cpu_asid->hsa_asid = 0;
        }

        /*
         * For now, only expose performance counter support if the host
         * supports "extended" counters. This makes MSR access more consistent
         * for logic handling that state.
         */
        if (is_x86_feature(x86_featureset, X86FSET_AMD_PCEC)) {
                hma_svm_cpc_allowed = HCF_EN_BASE | HCF_EN_EXTD;
        }

        hma_svm_ready = B_TRUE;
        return (0);

bail:
        hma_svm_error = msg;
        cmn_err(CE_NOTE, "!hma_svm_init: %s", msg);
        return (-1);
}