xref: /freebsd/sys/x86/x86/cpu_machdep.c (revision 459dc427873c9a294387ec74a96e6f7824de7435)
1 /*-
2  * Copyright (c) 2003 Peter Wemm.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * William Jolitz.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #include "opt_acpi.h"
41 #include "opt_atpic.h"
42 #include "opt_cpu.h"
43 #include "opt_ddb.h"
44 #include "opt_inet.h"
45 #include "opt_isa.h"
46 #include "opt_kdb.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_platform.h"
50 #include "opt_sched.h"
51 #ifdef __i386__
52 #include "opt_apic.h"
53 #endif
54 
55 #include <sys/param.h>
56 #include <sys/proc.h>
57 #include <sys/systm.h>
58 #include <sys/bus.h>
59 #include <sys/cpu.h>
60 #include <sys/domainset.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/ktr.h>
64 #include <sys/lock.h>
65 #include <sys/malloc.h>
66 #include <sys/mutex.h>
67 #include <sys/pcpu.h>
68 #include <sys/rwlock.h>
69 #include <sys/sched.h>
70 #include <sys/smp.h>
71 #include <sys/sysctl.h>
72 
73 #include <machine/clock.h>
74 #include <machine/cpu.h>
75 #include <machine/cpufunc.h>
76 #include <machine/cputypes.h>
77 #include <machine/specialreg.h>
78 #include <machine/md_var.h>
79 #include <machine/trap.h>
80 #include <machine/tss.h>
81 #ifdef SMP
82 #include <machine/smp.h>
83 #endif
84 #ifdef CPU_ELAN
85 #include <machine/elan_mmcr.h>
86 #endif
87 #include <x86/acpica_machdep.h>
88 #include <x86/ifunc.h>
89 
90 #include <vm/vm.h>
91 #include <vm/vm_extern.h>
92 #include <vm/vm_kern.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_pager.h>
97 #include <vm/vm_param.h>
98 
99 #include <isa/isareg.h>
100 
101 #include <contrib/dev/acpica/include/acpi.h>
102 
103 #define	STATE_RUNNING	0x0
104 #define	STATE_MWAIT	0x1
105 #define	STATE_SLEEPING	0x2
106 
107 #ifdef SMP
108 static u_int	cpu_reset_proxyid;
109 static volatile u_int	cpu_reset_proxy_active;
110 #endif
111 
112 char bootmethod[16];
113 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
114     "System firmware boot method");
115 
116 struct msr_op_arg {
117 	u_int msr;
118 	int op;
119 	uint64_t arg1;
120 	uint64_t *res;
121 };
122 
123 static void
124 x86_msr_op_one(void *argp)
125 {
126 	struct msr_op_arg *a;
127 	uint64_t v;
128 
129 	a = argp;
130 	switch (a->op) {
131 	case MSR_OP_ANDNOT:
132 		v = rdmsr(a->msr);
133 		v &= ~a->arg1;
134 		wrmsr(a->msr, v);
135 		break;
136 	case MSR_OP_OR:
137 		v = rdmsr(a->msr);
138 		v |= a->arg1;
139 		wrmsr(a->msr, v);
140 		break;
141 	case MSR_OP_WRITE:
142 		wrmsr(a->msr, a->arg1);
143 		break;
144 	case MSR_OP_READ:
145 		v = rdmsr(a->msr);
146 		*a->res = v;
147 		break;
148 	}
149 }
150 
151 #define	MSR_OP_EXMODE_MASK	0xf0000000
152 #define	MSR_OP_OP_MASK		0x000000ff
153 #define	MSR_OP_GET_CPUID(x)	(((x) & ~MSR_OP_EXMODE_MASK) >> 8)
154 
155 void
156 x86_msr_op(u_int msr, u_int op, uint64_t arg1, uint64_t *res)
157 {
158 	struct thread *td;
159 	struct msr_op_arg a;
160 	cpuset_t set;
161 	u_int exmode;
162 	int bound_cpu, cpu, i, is_bound;
163 
164 	a.op = op & MSR_OP_OP_MASK;
165 	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
166 	    a.op == MSR_OP_WRITE || a.op == MSR_OP_READ);
167 	exmode = op & MSR_OP_EXMODE_MASK;
168 	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED_ALL ||
169 	    exmode == MSR_OP_SCHED_ONE || exmode == MSR_OP_RENDEZVOUS_ALL ||
170 	    exmode == MSR_OP_RENDEZVOUS_ONE);
171 	a.msr = msr;
172 	a.arg1 = arg1;
173 	a.res = res;
174 	switch (exmode) {
175 	case MSR_OP_LOCAL:
176 		x86_msr_op_one(&a);
177 		break;
178 	case MSR_OP_SCHED_ALL:
179 		td = curthread;
180 		thread_lock(td);
181 		is_bound = sched_is_bound(td);
182 		bound_cpu = td->td_oncpu;
183 		CPU_FOREACH(i) {
184 			sched_bind(td, i);
185 			x86_msr_op_one(&a);
186 		}
187 		if (is_bound)
188 			sched_bind(td, bound_cpu);
189 		else
190 			sched_unbind(td);
191 		thread_unlock(td);
192 		break;
193 	case MSR_OP_SCHED_ONE:
194 		td = curthread;
195 		cpu = MSR_OP_GET_CPUID(op);
196 		thread_lock(td);
197 		is_bound = sched_is_bound(td);
198 		bound_cpu = td->td_oncpu;
199 		if (!is_bound || bound_cpu != cpu)
200 			sched_bind(td, cpu);
201 		x86_msr_op_one(&a);
202 		if (is_bound) {
203 			if (bound_cpu != cpu)
204 				sched_bind(td, bound_cpu);
205 		} else {
206 			sched_unbind(td);
207 		}
208 		thread_unlock(td);
209 		break;
210 	case MSR_OP_RENDEZVOUS_ALL:
211 		smp_rendezvous(smp_no_rendezvous_barrier, x86_msr_op_one,
212 		    smp_no_rendezvous_barrier, &a);
213 		break;
214 	case MSR_OP_RENDEZVOUS_ONE:
215 		cpu = MSR_OP_GET_CPUID(op);
216 		CPU_SETOF(cpu, &set);
217 		smp_rendezvous_cpus(set, smp_no_rendezvous_barrier,
218 		    x86_msr_op_one, smp_no_rendezvous_barrier, &a);
219 		break;
220 	}
221 }
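
/*
 * Editor's sketch (not part of the original source): how the callers later
 * in this file combine an execution mode with an operation when invoking
 * x86_msr_op().  The MSR and bit names are ones already used below; for the
 * *_ONE modes the target CPU id is encoded in the 'op' word, as decoded by
 * MSR_OP_GET_CPUID() above.
 */
static void __unused
x86_msr_op_example(void)
{
	uint64_t v;

	/* Set IA32_SPEC_CTRL.IBRS on every CPU via an SMP rendezvous. */
	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_RENDEZVOUS_ALL | MSR_OP_OR,
	    IA32_SPEC_CTRL_IBRS, NULL);

	/* Read the MSR back on the local CPU only. */
	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL | MSR_OP_READ, 0, &v);
}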
222 
223 /*
224  * Initialized automatically, based on known CPU errata, in cpu_idle_tun() below.
225  */
226 bool mwait_cpustop_broken = false;
227 SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
228     &mwait_cpustop_broken, 0,
229     "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
230 
231 /*
232  * Flush the D-cache for non-DMA I/O so that the I-cache can
233  * be made coherent later.
234  */
235 void
236 cpu_flush_dcache(void *ptr, size_t len)
237 {
238 	/* Not applicable */
239 }
240 
241 void
242 acpi_cpu_c1(void)
243 {
244 
245 	__asm __volatile("sti; hlt");
246 }
247 
248 /*
249  * Use mwait to pause execution while waiting for an interrupt or
250  * another thread to signal that there is more work.
251  *
252  * NOTE: Interrupts will cause a wakeup; however, this function does
253  * not enable interrupt handling.  The caller is responsible for
254  * enabling interrupts.
255  */
256 void
257 acpi_cpu_idle_mwait(uint32_t mwait_hint)
258 {
259 	int *state;
260 	uint64_t v;
261 
262 	/*
263 	 * A comment in a Linux patch claims that 'CPUs run faster with
264 	 * speculation protection disabled. All CPU threads in a core
265 	 * must disable speculation protection for it to be
266 	 * disabled. Disable it while we are idle so the other
267 	 * hyperthread can run fast.'
268 	 *
269 	 * XXXKIB.  Software coordination mode should be supported,
270 	 * but all Intel CPUs provide hardware coordination.
271 	 */
272 
273 	state = &PCPU_PTR(monitorbuf)->idle_state;
274 	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
275 	    ("cpu_mwait_cx: wrong monitorbuf state"));
276 	atomic_store_int(state, STATE_MWAIT);
277 	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
278 		v = rdmsr(MSR_IA32_SPEC_CTRL);
279 		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
280 		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
281 	} else {
282 		v = 0;
283 	}
284 	cpu_monitor(state, 0, 0);
285 	if (atomic_load_int(state) == STATE_MWAIT)
286 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
287 
288 	/*
289 	 * SSB cannot be disabled while we sleep; or rather, if it was
290 	 * disabled, the sysctl thread will bind to our CPU to tweak
291 	 * the MSR.
292 	 */
293 	if (v != 0)
294 		wrmsr(MSR_IA32_SPEC_CTRL, v);
295 
296 	/*
297 	 * We should exit on any event that interrupts mwait, because
298 	 * that event might be a wanted interrupt.
299 	 */
300 	atomic_store_int(state, STATE_RUNNING);
301 }
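
/*
 * Editor's sketch of the caller contract described above (hypothetical
 * caller, not part of the original source): the per-CPU idle state word
 * must already be STATE_SLEEPING, and it is the caller that re-enables
 * interrupts afterwards; MWAIT_INTRBREAK lets a masked interrupt still
 * break the mwait.
 */
static void __unused
acpi_cpu_idle_mwait_caller_sketch(uint32_t hint)
{
	int *state;

	state = &PCPU_PTR(monitorbuf)->idle_state;
	atomic_store_int(state, STATE_SLEEPING);
	disable_intr();
	acpi_cpu_idle_mwait(hint);
	enable_intr();
}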
302 
303 /* Get current clock frequency for the given cpu id. */
304 int
305 cpu_est_clockrate(int cpu_id, uint64_t *rate)
306 {
307 	uint64_t tsc1, tsc2;
308 	uint64_t acnt, mcnt, perf;
309 	register_t reg;
310 
311 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
312 		return (EINVAL);
313 #ifdef __i386__
314 	if ((cpu_feature & CPUID_TSC) == 0)
315 		return (EOPNOTSUPP);
316 #endif
317 
318 	/*
319 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
320 	 * DELAY(9) based logic fails.
321 	 */
322 	if (tsc_is_invariant && !tsc_perf_stat)
323 		return (EOPNOTSUPP);
324 
325 #ifdef SMP
326 	if (smp_cpus > 1) {
327 		/* Schedule ourselves on the indicated cpu. */
328 		thread_lock(curthread);
329 		sched_bind(curthread, cpu_id);
330 		thread_unlock(curthread);
331 	}
332 #endif
333 
334 	/* Calibrate by measuring a short delay. */
335 	reg = intr_disable();
336 	if (tsc_is_invariant) {
337 		wrmsr(MSR_MPERF, 0);
338 		wrmsr(MSR_APERF, 0);
339 		tsc1 = rdtsc();
340 		DELAY(1000);
341 		mcnt = rdmsr(MSR_MPERF);
342 		acnt = rdmsr(MSR_APERF);
343 		tsc2 = rdtsc();
344 		intr_restore(reg);
345 		perf = 1000 * acnt / mcnt;
346 		*rate = (tsc2 - tsc1) * perf;
347 	} else {
348 		tsc1 = rdtsc();
349 		DELAY(1000);
350 		tsc2 = rdtsc();
351 		intr_restore(reg);
352 		*rate = (tsc2 - tsc1) * 1000;
353 	}
354 
355 #ifdef SMP
356 	if (smp_cpus > 1) {
357 		thread_lock(curthread);
358 		sched_unbind(curthread);
359 		thread_unlock(curthread);
360 	}
361 #endif
362 
363 	return (0);
364 }
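
/*
 * Worked example of the estimate above (editor's illustration, made-up
 * numbers): if the 1000us DELAY spans tsc2 - tsc1 = 3,000,000 TSC ticks
 * while MSR_APERF counted acnt = 1,500,000 and MSR_MPERF counted
 * mcnt = 3,000,000, then perf = 1000 * 1,500,000 / 3,000,000 = 500 and
 * *rate = 3,000,000 * 500 = 1.5e9, i.e. an effective clock of ~1.5 GHz on
 * a part whose invariant TSC ticks at ~3 GHz.
 */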
365 
366 /*
367  * Shutdown the CPU as much as possible
368  */
369 void
370 cpu_halt(void)
371 {
372 	for (;;)
373 		halt();
374 }
375 
376 static void
377 cpu_reset_real(void)
378 {
379 	struct region_descriptor null_idt;
380 	int b;
381 
382 	disable_intr();
383 #ifdef CPU_ELAN
384 	if (elan_mmcr != NULL)
385 		elan_mmcr->RESCFG = 1;
386 #endif
387 #ifdef __i386__
388 	if (cpu == CPU_GEODE1100) {
389 		/* Attempt Geode's own reset */
390 		outl(0xcf8, 0x80009044ul);
391 		outl(0xcfc, 0xf);
392 	}
393 #endif
394 #if !defined(BROKEN_KEYBOARD_RESET)
395 	/*
396 	 * Attempt to do a CPU reset via the keyboard controller,
397 	 * do not turn off GateA20, as any machine that fails
398 	 * to do the reset here would then end up in no man's land.
399 	 */
400 	outb(IO_KBD + 4, 0xFE);
401 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
402 #endif
403 
404 	/*
405 	 * Attempt to force a reset via the Reset Control register at
406 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
407 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
408 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
409 	 * "hard" reset.  We try a "hard" reset.  The first write sets
410 	 * bit 1 to select a "hard" reset and clears bit 2.  The
411 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
412 	 * a reset.
413 	 */
414 	outb(0xcf9, 0x2);
415 	outb(0xcf9, 0x6);
416 	DELAY(500000);  /* wait 0.5 sec to see if that did it */
417 
418 	/*
419 	 * Attempt to force a reset via the Fast A20 and Init register
420 	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
421 	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
422 	 * preserve bit 1 while setting bit 0.  We also must clear bit
423 	 * 0 before setting it if it isn't already clear.
424 	 */
425 	b = inb(0x92);
426 	if (b != 0xff) {
427 		if ((b & 0x1) != 0)
428 			outb(0x92, b & 0xfe);
429 		outb(0x92, b | 0x1);
430 		DELAY(500000);  /* wait 0.5 sec to see if that did it */
431 	}
432 
433 	printf("No known reset method worked, attempting CPU shutdown\n");
434 	DELAY(1000000); /* wait 1 sec for printf to complete */
435 
436 	/* Wipe the IDT. */
437 	null_idt.rd_limit = 0;
438 	null_idt.rd_base = 0;
439 	lidt(&null_idt);
440 
441 	/* "good night, sweet prince .... <THUNK!>" */
442 	breakpoint();
443 
444 	/* NOTREACHED */
445 	while(1);
446 }
447 
448 #ifdef SMP
449 static void
450 cpu_reset_proxy(void)
451 {
452 
453 	cpu_reset_proxy_active = 1;
454 	while (cpu_reset_proxy_active == 1)
455 		ia32_pause(); /* Wait for other cpu to see that we've started */
456 
457 	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
458 	DELAY(1000000);
459 	cpu_reset_real();
460 }
461 #endif
462 
463 void
464 cpu_reset(void)
465 {
466 #ifdef SMP
467 	struct monitorbuf *mb;
468 	cpuset_t map;
469 	u_int cnt;
470 
471 	if (smp_started) {
472 		map = all_cpus;
473 		CPU_CLR(PCPU_GET(cpuid), &map);
474 		CPU_ANDNOT(&map, &map, &stopped_cpus);
475 		if (!CPU_EMPTY(&map)) {
476 			printf("cpu_reset: Stopping other CPUs\n");
477 			stop_cpus(map);
478 		}
479 
480 		if (PCPU_GET(cpuid) != 0) {
481 			cpu_reset_proxyid = PCPU_GET(cpuid);
482 			cpustop_restartfunc = cpu_reset_proxy;
483 			cpu_reset_proxy_active = 0;
484 			printf("cpu_reset: Restarting BSP\n");
485 
486 			/* Restart CPU #0. */
487 			CPU_SETOF(0, &started_cpus);
488 			mb = &pcpu_find(0)->pc_monitorbuf;
489 			atomic_store_int(&mb->stop_state,
490 			    MONITOR_STOPSTATE_RUNNING);
491 
492 			cnt = 0;
493 			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
494 				ia32_pause();
495 				cnt++;	/* Wait for BSP to announce restart */
496 			}
497 			if (cpu_reset_proxy_active == 0) {
498 				printf("cpu_reset: Failed to restart BSP\n");
499 			} else {
500 				cpu_reset_proxy_active = 2;
501 				while (1)
502 					ia32_pause();
503 				/* NOTREACHED */
504 			}
505 		}
506 	}
507 #endif
508 	cpu_reset_real();
509 	/* NOTREACHED */
510 }
511 
512 bool
513 cpu_mwait_usable(void)
514 {
515 
516 	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
517 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
518 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
519 }
520 
521 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
522 
523 int cpu_amdc1e_bug = 0;			/* AMD C1E APIC workaround required. */
524 
525 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
526 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
527     0, "Use MONITOR/MWAIT for short idle");
528 
529 static bool
530 cpu_idle_enter(int *statep, int newstate)
531 {
532 	KASSERT(atomic_load_int(statep) == STATE_RUNNING,
533 	    ("%s: state %d", __func__, atomic_load_int(statep)));
534 
535 	/*
536 	 * A fence is needed to prevent reordering of the load in
537 	 * sched_runnable() with this store to the idle state word.  Without it,
538 	 * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
539 	 * added load to the queue, and elide an IPI.  Then, sched_runnable()
540 	 * can observe tdq_load == 0, so the CPU ends up idling with pending
541 	 * work.  tdq_notify() similarly ensures that a prior update to tdq_load
542 	 * is visible before calling cpu_idle_wakeup().
543 	 */
544 	atomic_store_int(statep, newstate);
545 #if defined(SCHED_ULE) && defined(SMP)
546 	atomic_thread_fence_seq_cst();
547 #endif
548 
549 	/*
550 	 * Since we may be in a critical section from cpu_idle(), if
551 	 * an interrupt fires during that critical section we may have
552 	 * a pending preemption.  If the CPU halts, then that thread
553 	 * may not execute until a later interrupt awakens the CPU.
554 	 * To handle this race, check for a runnable thread after
555 	 * disabling interrupts and immediately return if one is
556 	 * found.  Also, we must absolutely guarantee that hlt is
557 	 * the next instruction after sti.  This ensures that any
558 	 * interrupt that fires after the call to disable_intr() will
559 	 * immediately awaken the CPU from hlt.  Finally, note that this
560 	 * works on x86 because interrupts are delivered only once the
561 	 * instruction following sti takes effect, while IF is set to 1
562 	 * immediately, allowing the hlt instruction to acknowledge the
563 	 * interrupt.
564 	 */
565 	disable_intr();
566 	if (sched_runnable()) {
567 		enable_intr();
568 		atomic_store_int(statep, STATE_RUNNING);
569 		return (false);
570 	} else {
571 		return (true);
572 	}
573 }
574 
575 static void
576 cpu_idle_exit(int *statep)
577 {
578 	atomic_store_int(statep, STATE_RUNNING);
579 }
580 
581 static void
582 cpu_idle_acpi(sbintime_t sbt)
583 {
584 	int *state;
585 
586 	state = &PCPU_PTR(monitorbuf)->idle_state;
587 	if (cpu_idle_enter(state, STATE_SLEEPING)) {
588 		if (cpu_idle_hook)
589 			cpu_idle_hook(sbt);
590 		else
591 			acpi_cpu_c1();
592 		cpu_idle_exit(state);
593 	}
594 }
595 
596 static void
597 cpu_idle_hlt(sbintime_t sbt)
598 {
599 	int *state;
600 
601 	state = &PCPU_PTR(monitorbuf)->idle_state;
602 	if (cpu_idle_enter(state, STATE_SLEEPING)) {
603 		acpi_cpu_c1();
604 		atomic_store_int(state, STATE_RUNNING);
605 	}
606 }
607 
608 static void
609 cpu_idle_mwait(sbintime_t sbt)
610 {
611 	int *state;
612 
613 	state = &PCPU_PTR(monitorbuf)->idle_state;
614 	if (cpu_idle_enter(state, STATE_MWAIT)) {
615 		cpu_monitor(state, 0, 0);
616 		if (atomic_load_int(state) == STATE_MWAIT)
617 			__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
618 		else
619 			enable_intr();
620 		cpu_idle_exit(state);
621 	}
622 }
623 
624 static void
625 cpu_idle_spin(sbintime_t sbt)
626 {
627 	int *state;
628 	int i;
629 
630 	state = &PCPU_PTR(monitorbuf)->idle_state;
631 	atomic_store_int(state, STATE_RUNNING);
632 
633 	/*
634 	 * The sched_runnable() check is racy, but since it sits in a
635 	 * loop, missing it once has little impact, if any (and it is
636 	 * much better than not checking at all).
637 	 */
638 	for (i = 0; i < 1000; i++) {
639 		if (sched_runnable())
640 			return;
641 		cpu_spinwait();
642 	}
643 }
644 
645 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
646 
647 void
648 cpu_idle(int busy)
649 {
650 	uint64_t msr;
651 	sbintime_t sbt = -1;
652 
653 	CTR1(KTR_SPARE2, "cpu_idle(%d)", busy);
654 
655 	/* If we are busy - try to use fast methods. */
656 	if (busy) {
657 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
658 			cpu_idle_mwait(busy);
659 			goto out;
660 		}
661 	}
662 
663 	/* If we have time - switch timers into idle mode. */
664 	if (!busy) {
665 		critical_enter();
666 		sbt = cpu_idleclock();
667 	}
668 
669 	/* Apply AMD APIC timer C1E workaround. */
670 	if (cpu_amdc1e_bug && cpu_disable_c3_sleep) {
671 		msr = rdmsr(MSR_AMDK8_IPM);
672 		if ((msr & (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)) != 0)
673 			wrmsr(MSR_AMDK8_IPM, msr & ~(AMDK8_SMIONCMPHALT |
674 			    AMDK8_C1EONCMPHALT));
675 	}
676 
677 	/* Call main idle method. */
678 	cpu_idle_fn(sbt);
679 
680 	/* Switch timers back into active mode. */
681 	if (!busy) {
682 		cpu_activeclock();
683 		critical_exit();
684 	}
685 out:
686 	CTR1(KTR_SPARE2, "cpu_idle(%d) done", busy);
687 }
688 
689 static int cpu_idle_apl31_workaround;
690 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
691     &cpu_idle_apl31_workaround, 0,
692     "Apollo Lake APL31 MWAIT bug workaround");
693 
694 int
695 cpu_idle_wakeup(int cpu)
696 {
697 	struct monitorbuf *mb;
698 	int *state;
699 
700 	mb = &pcpu_find(cpu)->pc_monitorbuf;
701 	state = &mb->idle_state;
702 	switch (atomic_load_int(state)) {
703 	case STATE_SLEEPING:
704 		return (0);
705 	case STATE_MWAIT:
706 		atomic_store_int(state, STATE_RUNNING);
707 		return (cpu_idle_apl31_workaround ? 0 : 1);
708 	case STATE_RUNNING:
709 		return (1);
710 	default:
711 		panic("bad monitor state");
712 		return (1);
713 	}
714 }
715 
716 /*
717  * Ordered by speed/power consumption.
718  */
719 static const struct {
720 	void	*id_fn;
721 	const char *id_name;
722 	int	id_cpuid2_flag;
723 } idle_tbl[] = {
724 	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
725 	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
726 	    .id_cpuid2_flag = CPUID2_MON },
727 	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
728 	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
729 };
730 
731 static int
732 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
733 {
734 	char *avail, *p;
735 	int error;
736 	int i;
737 
738 	avail = malloc(256, M_TEMP, M_WAITOK);
739 	p = avail;
740 	for (i = 0; i < nitems(idle_tbl); i++) {
741 		if (idle_tbl[i].id_cpuid2_flag != 0 &&
742 		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
743 			continue;
744 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
745 		    cpu_idle_hook == NULL)
746 			continue;
747 		p += sprintf(p, "%s%s", p != avail ? ", " : "",
748 		    idle_tbl[i].id_name);
749 	}
750 	error = sysctl_handle_string(oidp, avail, 0, req);
751 	free(avail, M_TEMP);
752 	return (error);
753 }
754 
755 SYSCTL_PROC(_machdep, OID_AUTO, idle_available,
756     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
757     0, 0, idle_sysctl_available, "A",
758     "list of available idle functions");
759 
760 static bool
761 cpu_idle_selector(const char *new_idle_name)
762 {
763 	int i;
764 
765 	for (i = 0; i < nitems(idle_tbl); i++) {
766 		if (idle_tbl[i].id_cpuid2_flag != 0 &&
767 		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
768 			continue;
769 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
770 		    cpu_idle_hook == NULL)
771 			continue;
772 		if (strcmp(idle_tbl[i].id_name, new_idle_name))
773 			continue;
774 		cpu_idle_fn = idle_tbl[i].id_fn;
775 		if (bootverbose)
776 			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
777 		return (true);
778 	}
779 	return (false);
780 }
781 
782 static int
783 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
784 {
785 	char buf[16];
786 	const char *p;
787 	int error, i;
788 
789 	p = "unknown";
790 	for (i = 0; i < nitems(idle_tbl); i++) {
791 		if (idle_tbl[i].id_fn == cpu_idle_fn) {
792 			p = idle_tbl[i].id_name;
793 			break;
794 		}
795 	}
796 	strncpy(buf, p, sizeof(buf));
797 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
798 	if (error != 0 || req->newptr == NULL)
799 		return (error);
800 	return (cpu_idle_selector(buf) ? 0 : EINVAL);
801 }
802 
803 SYSCTL_PROC(_machdep, OID_AUTO, idle,
804     CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
805     0, 0, cpu_idle_sysctl, "A",
806     "currently selected idle function");
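
/*
 * Editor's note (usage illustration, not in the original source): because
 * the OID above is a tunable as well as a sysctl, the idle method can be
 * selected at boot or at runtime, e.g.:
 *
 *	machdep.idle="hlt"		(in /boot/loader.conf)
 *	# sysctl machdep.idle=mwait	(at runtime)
 *
 * The accepted names are those reported by machdep.idle_available.
 */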
807 
808 static void
809 cpu_idle_tun(void *unused __unused)
810 {
811 	char tunvar[16];
812 
813 	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
814 		cpu_idle_selector(tunvar);
815 	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
816 	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
817 		/* Ryzen errata 1057, 1109. */
818 		cpu_idle_selector("hlt");
819 		idle_mwait = 0;
820 		mwait_cpustop_broken = true;
821 	}
822 
823 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
824 	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x5c) {
825 		/*
826 		 * Apollo Lake errata APL31 (public errata APL30).
827 		 * Stores to the armed address range may not trigger
828 		 * MWAIT to resume execution.  OS needs to use
829 		 * interrupts to wake processors from MWAIT-induced
830 		 * sleep states.
831 		 */
832 		cpu_idle_apl31_workaround = 1;
833 		mwait_cpustop_broken = true;
834 	}
835 	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
836 }
837 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
838 
839 static int panic_on_nmi = 0xff;
840 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
841     &panic_on_nmi, 0,
842     "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all");
843 int nmi_is_broadcast = 1;
844 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
845     &nmi_is_broadcast, 0,
846     "Chipset NMI is broadcast");
847 int (*apei_nmi)(void);
848 
849 void
850 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
851 {
852 	bool claimed = false;
853 
854 #ifdef DEV_ISA
855 	/* machine/parity/power fail/"kitchen sink" faults */
856 	if (isa_nmi(frame->tf_err)) {
857 		claimed = true;
858 		if ((panic_on_nmi & 1) != 0)
859 			panic("NMI indicates hardware failure");
860 	}
861 #endif /* DEV_ISA */
862 
863 	/* ACPI Platform Error Interfaces callback. */
864 	if (apei_nmi != NULL && (*apei_nmi)())
865 		claimed = true;
866 
867 	/*
868 	 * NMIs can be useful for debugging.  They can be hooked up to a
869 	 * pushbutton, usually on an ISA, PCI, or PCIe card.  They can also be
870 	 * generated by an IPMI BMC, either manually or in response to a
871 	 * watchdog timeout.  For example, see the "power diag" command in
872 	 * ports/sysutils/ipmitool.  They can also be generated by a
873 	 * hypervisor; see "bhyvectl --inject-nmi".
874 	 */
875 
876 #ifdef KDB
877 	if (!claimed && (panic_on_nmi & 2) != 0) {
878 		if (debugger_on_panic) {
879 			printf("NMI/cpu%d ... going to debugger\n", cpu);
880 			claimed = kdb_trap(type, 0, frame);
881 		}
882 	}
883 #endif /* KDB */
884 
885 	if (!claimed && panic_on_nmi != 0)
886 		panic("NMI");
887 }
888 
889 /*
890  * Dynamically registered NMI handlers.
891  */
892 struct nmi_handler {
893 	int running;
894 	int (*func)(struct trapframe *);
895 	struct nmi_handler *next;
896 };
897 static struct nmi_handler *nmi_handlers_head = NULL;
898 MALLOC_DEFINE(M_NMI, "NMI handlers",
899     "List entries for dynamically registered NMI handlers");
900 
901 void
902 nmi_register_handler(int (*handler)(struct trapframe *))
903 {
904 	struct nmi_handler *hp;
905 	int (*hpf)(struct trapframe *);
906 
907 	hp = (struct nmi_handler *)atomic_load_acq_ptr(
908 	    (uintptr_t *)&nmi_handlers_head);
909 	while (hp != NULL) {
910 		hpf = hp->func;
911 		MPASS(hpf != handler);
912 		if (hpf == NULL &&
913 		    atomic_cmpset_ptr((volatile uintptr_t *)&hp->func,
914 		    (uintptr_t)NULL, (uintptr_t)handler) != 0) {
915 			hp->running = 0;
916 			return;
917 		}
918 		hp = (struct nmi_handler *)atomic_load_acq_ptr(
919 		    (uintptr_t *)&hp->next);
920 	}
921 	hp = malloc(sizeof(struct nmi_handler), M_NMI, M_WAITOK | M_ZERO);
922 	hp->func = handler;
923 	hp->next = nmi_handlers_head;
924 	while (atomic_fcmpset_rel_ptr(
925 	    (volatile uintptr_t *)&nmi_handlers_head,
926 	    (uintptr_t *)&hp->next, (uintptr_t)hp) == 0)
927 	        ;
928 }
929 
930 void
931 nmi_remove_handler(int (*handler)(struct trapframe *))
932 {
933 	struct nmi_handler *hp;
934 
935 	hp = (struct nmi_handler *)atomic_load_acq_ptr(
936 	    (uintptr_t *)&nmi_handlers_head);
937 	while (hp != NULL) {
938 		if (hp->func == handler) {
939 			hp->func = NULL;
940 			/* Wait for the handler to exit before returning. */
941 			while (atomic_load_int(&hp->running) != 0)
942 				cpu_spinwait();
943 			return;
944 		}
945 		hp = (struct nmi_handler *)atomic_load_acq_ptr(
946 		    (uintptr_t *)&hp->next);
947 	}
948 
949 	panic("%s: attempting to remove an unregistered NMI handler %p\n",
950 	    __func__, handler);
951 }
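
/*
 * Editor's sketch (hypothetical driver code, not part of the original
 * source): a dynamically registered NMI handler returns nonzero to claim
 * the NMI, which lets nmi_handle_intr() below skip the fallback
 * panic/debugger path.
 */
static int
example_nmi_handler(struct trapframe *frame __unused)
{

	/* Return 1 here if this device actually raised the NMI. */
	return (0);
}

static void __unused
example_nmi_register(void)
{

	nmi_register_handler(example_nmi_handler);
	/* ... and before the code is torn down: */
	nmi_remove_handler(example_nmi_handler);
}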
952 
953 void
954 nmi_handle_intr(struct trapframe *frame)
955 {
956 	int (*func)(struct trapframe *);
957 	struct nmi_handler *hp;
958 	bool handled;
959 
960 #ifdef SMP
961 	/* Handler for NMI IPIs used for stopping CPUs. */
962 	if (ipi_nmi_handler() == 0)
963 		return;
964 #endif
965 	handled = false;
966 	hp = (struct nmi_handler *)atomic_load_acq_ptr(
967 	    (uintptr_t *)&nmi_handlers_head);
968 	while (hp != NULL) {
969 		func = hp->func;
970 		if (func != NULL) {
971 			atomic_add_int(&hp->running, 1);
972 			if (func(frame) != 0)
973 				handled = true;
974 			atomic_subtract_int(&hp->running, 1);
975 		}
976 		hp = (struct nmi_handler *)atomic_load_acq_ptr(
977 		    (uintptr_t *)&hp->next);
978 	}
979 	if (handled)
980 		return;
981 #ifdef SMP
982 	if (nmi_is_broadcast) {
983 		nmi_call_kdb_smp(T_NMI, frame);
984 		return;
985 	}
986 #endif
987 	nmi_call_kdb(PCPU_GET(cpuid), T_NMI, frame);
988 }
989 
990 static int hw_ibrs_active;
991 int hw_ibrs_ibpb_active;
992 int hw_ibrs_disable = 1;
993 
994 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
995     "Indirect Branch Restricted Speculation active");
996 
997 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs,
998     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
999     "Indirect Branch Restricted Speculation active");
1000 
1001 SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD,
1002     &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active");
1003 
1004 void
1005 hw_ibrs_recalculate(bool for_all_cpus)
1006 {
1007 	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
1008 		x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
1009 		    MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL) |
1010 		    (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
1011 		    IA32_SPEC_CTRL_IBRS, NULL);
1012 		hw_ibrs_active = hw_ibrs_disable == 0;
1013 		hw_ibrs_ibpb_active = 0;
1014 	} else {
1015 		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
1016 		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
1017 	}
1018 }
1019 
1020 static int
1021 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
1022 {
1023 	int error, val;
1024 
1025 	val = hw_ibrs_disable;
1026 	error = sysctl_handle_int(oidp, &val, 0, req);
1027 	if (error != 0 || req->newptr == NULL)
1028 		return (error);
1029 	hw_ibrs_disable = val != 0;
1030 	hw_ibrs_recalculate(true);
1031 	return (0);
1032 }
1033 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
1034     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
1035     "Disable Indirect Branch Restricted Speculation");
1036 
1037 SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT |
1038     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1039     hw_ibrs_disable_handler, "I",
1040     "Disable Indirect Branch Restricted Speculation");
1041 
1042 int hw_ssb_active;
1043 int hw_ssb_disable;
1044 
1045 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
1046     &hw_ssb_active, 0,
1047     "Speculative Store Bypass Disable active");
1048 
1049 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb,
1050     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1051     "Speculative Store Bypass Disable active");
1052 
1053 SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD,
1054     &hw_ssb_active, 0, "Speculative Store Bypass Disable active");
1055 
1056 static void
1057 hw_ssb_set(bool enable, bool for_all_cpus)
1058 {
1059 
1060 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
1061 		hw_ssb_active = 0;
1062 		return;
1063 	}
1064 	hw_ssb_active = enable;
1065 	x86_msr_op(MSR_IA32_SPEC_CTRL,
1066 	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1067 	    (for_all_cpus ? MSR_OP_SCHED_ALL : MSR_OP_LOCAL),
1068 	    IA32_SPEC_CTRL_SSBD, NULL);
1069 }
1070 
1071 void
1072 hw_ssb_recalculate(bool all_cpus)
1073 {
1074 
1075 	switch (hw_ssb_disable) {
1076 	default:
1077 		hw_ssb_disable = 0;
1078 		/* FALLTHROUGH */
1079 	case 0: /* off */
1080 		hw_ssb_set(false, all_cpus);
1081 		break;
1082 	case 1: /* on */
1083 		hw_ssb_set(true, all_cpus);
1084 		break;
1085 	case 2: /* auto */
1086 		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
1087 		    false : true, all_cpus);
1088 		break;
1089 	}
1090 }
1091 
1092 static int
1093 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
1094 {
1095 	int error, val;
1096 
1097 	val = hw_ssb_disable;
1098 	error = sysctl_handle_int(oidp, &val, 0, req);
1099 	if (error != 0 || req->newptr == NULL)
1100 		return (error);
1101 	hw_ssb_disable = val;
1102 	hw_ssb_recalculate(true);
1103 	return (0);
1104 }
1105 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
1106     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1107     hw_ssb_disable_handler, "I",
1108     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
1109 
1110 SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT |
1111     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1112     hw_ssb_disable_handler, "I",
1113     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
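
/*
 * Editor's note (illustrative, not in the original source): setting
 * hw.spec_store_bypass_disable=2 (or the machdep.mitigations.ssb.disable
 * alias above) selects the "auto" policy, which leaves SSBD off on parts
 * advertising IA32_ARCH_CAP_SSB_NO and enables it otherwise, as implemented
 * in hw_ssb_recalculate() above.
 */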
1114 
1115 int hw_mds_disable;
1116 
1117 /*
1118  * Handler for Microarchitectural Data Sampling issues.  Really not a
1119  * pointer to a C function: on amd64 the code must not change any CPU
1120  * architectural state except possibly %rflags. Also, it is always
1121  * called with interrupts disabled.
1122  */
1123 void mds_handler_void(void);
1124 void mds_handler_verw(void);
1125 void mds_handler_ivb(void);
1126 void mds_handler_bdw(void);
1127 void mds_handler_skl_sse(void);
1128 void mds_handler_skl_avx(void);
1129 void mds_handler_skl_avx512(void);
1130 void mds_handler_silvermont(void);
1131 void (*mds_handler)(void) = mds_handler_void;
1132 
1133 static int
1134 sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
1135 {
1136 	const char *state;
1137 
1138 	if (mds_handler == mds_handler_void)
1139 		state = "inactive";
1140 	else if (mds_handler == mds_handler_verw)
1141 		state = "VERW";
1142 	else if (mds_handler == mds_handler_ivb)
1143 		state = "software IvyBridge";
1144 	else if (mds_handler == mds_handler_bdw)
1145 		state = "software Broadwell";
1146 	else if (mds_handler == mds_handler_skl_sse)
1147 		state = "software Skylake SSE";
1148 	else if (mds_handler == mds_handler_skl_avx)
1149 		state = "software Skylake AVX";
1150 	else if (mds_handler == mds_handler_skl_avx512)
1151 		state = "software Skylake AVX512";
1152 	else if (mds_handler == mds_handler_silvermont)
1153 		state = "software Silvermont";
1154 	else
1155 		state = "unknown";
1156 	return (SYSCTL_OUT(req, state, strlen(state)));
1157 }
1158 
1159 SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
1160     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1161     sysctl_hw_mds_disable_state_handler, "A",
1162     "Microarchitectural Data Sampling Mitigation state");
1163 
1164 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds,
1165     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1166     "Microarchitectural Data Sampling Mitigation state");
1167 
1168 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state,
1169     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1170     sysctl_hw_mds_disable_state_handler, "A",
1171     "Microarchitectural Data Sampling Mitigation state");
1172 
1173 _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
1174 
1175 void
1176 hw_mds_recalculate(void)
1177 {
1178 	struct pcpu *pc;
1179 	vm_offset_t b64;
1180 	u_long xcr0;
1181 	int i;
1182 
1183 	/*
1184 	 * Allow the user to force the VERW variant even if MD_CLEAR is
1185 	 * not reported.  For instance, a hypervisor might unknowingly
1186 	 * filter the cap out.
1187 	 * For similar reasons, and for testing, allow enabling the
1188 	 * mitigation even when the MDS_NO cap is set.
1189 	 */
1190 	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
1191 	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
1192 	    hw_mds_disable == 3)) {
1193 		mds_handler = mds_handler_void;
1194 	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
1195 	    hw_mds_disable == 3) || hw_mds_disable == 1) {
1196 		mds_handler = mds_handler_verw;
1197 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1198 	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
1199 	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
1200 	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
1201 	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
1202 	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
1203 	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
1204 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1205 		/*
1206 		 * Nehalem, SandyBridge, IvyBridge
1207 		 */
1208 		CPU_FOREACH(i) {
1209 			pc = pcpu_find(i);
1210 			if (pc->pc_mds_buf == NULL) {
1211 				pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
1212 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
1213 				bzero(pc->pc_mds_buf, 16);
1214 			}
1215 		}
1216 		mds_handler = mds_handler_ivb;
1217 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1218 	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
1219 	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
1220 	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
1221 	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
1222 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1223 		/*
1224 		 * Haswell, Broadwell
1225 		 */
1226 		CPU_FOREACH(i) {
1227 			pc = pcpu_find(i);
1228 			if (pc->pc_mds_buf == NULL) {
1229 				pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
1230 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
1231 				bzero(pc->pc_mds_buf, 16);
1232 			}
1233 		}
1234 		mds_handler = mds_handler_bdw;
1235 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1236 	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
1237 	    CPUID_STEPPING) <= 5) ||
1238 	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
1239 	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
1240 	    CPUID_STEPPING) <= 0xb) ||
1241 	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
1242 	    CPUID_STEPPING) <= 0xc)) &&
1243 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1244 		/*
1245 		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
1246 		 * CascadeLake
1247 		 */
1248 		CPU_FOREACH(i) {
1249 			pc = pcpu_find(i);
1250 			if (pc->pc_mds_buf == NULL) {
1251 				pc->pc_mds_buf = malloc_domainset(6 * 1024,
1252 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
1253 				    M_WAITOK);
1254 				b64 = (vm_offset_t)malloc_domainset(64 + 63,
1255 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
1256 				    M_WAITOK);
1257 				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
1258 				bzero(pc->pc_mds_buf64, 64);
1259 			}
1260 		}
1261 		xcr0 = rxcr(0);
1262 		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
1263 		    (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0)
1264 			mds_handler = mds_handler_skl_avx512;
1265 		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
1266 		    (cpu_feature2 & CPUID2_AVX) != 0)
1267 			mds_handler = mds_handler_skl_avx;
1268 		else
1269 			mds_handler = mds_handler_skl_sse;
1270 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1271 	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
1272 	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
1273 	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
1274 	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
1275 	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
1276 	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
1277 	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
1278 	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
1279 	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
1280 	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
1281 	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
1282 	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
1283 	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
1284 	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
1285 	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
1286 		/* Silvermont, Airmont */
1287 		CPU_FOREACH(i) {
1288 			pc = pcpu_find(i);
1289 			if (pc->pc_mds_buf == NULL)
1290 				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
1291 		}
1292 		mds_handler = mds_handler_silvermont;
1293 	} else {
1294 		hw_mds_disable = 0;
1295 		mds_handler = mds_handler_void;
1296 	}
1297 }
1298 
1299 static void
1300 hw_mds_recalculate_boot(void *arg __unused)
1301 {
1302 
1303 	hw_mds_recalculate();
1304 }
1305 SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
1306 
1307 static int
1308 sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
1309 {
1310 	int error, val;
1311 
1312 	val = hw_mds_disable;
1313 	error = sysctl_handle_int(oidp, &val, 0, req);
1314 	if (error != 0 || req->newptr == NULL)
1315 		return (error);
1316 	if (val < 0 || val > 3)
1317 		return (EINVAL);
1318 	hw_mds_disable = val;
1319 	hw_mds_recalculate();
1320 	return (0);
1321 }
1322 
1323 SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
1324     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1325     sysctl_mds_disable_handler, "I",
1326     "Microarchitectural Data Sampling Mitigation "
1327     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
1328 
1329 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT |
1330     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1331     sysctl_mds_disable_handler, "I",
1332     "Microarchitectural Data Sampling Mitigation "
1333     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
1334 
1335 /*
1336  * Intel Transactional Memory Asynchronous Abort Mitigation
1337  * CVE-2019-11135
1338  */
1339 int x86_taa_enable;
1340 int x86_taa_state;
1341 enum {
1342 	TAA_NONE	= 0,	/* No mitigation enabled */
1343 	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
1344 	TAA_VERW	= 2,	/* Use VERW mitigation */
1345 	TAA_AUTO	= 3,	/* Automatically select the mitigation */
1346 
1347 	/* The states below are not selectable by the operator */
1348 
1349 	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
1350 	TAA_NOT_PRESENT	= 5	/* TSX is not present */
1351 };
1352 
1353 static void
1354 taa_set(bool enable, bool all)
1355 {
1356 
1357 	x86_msr_op(MSR_IA32_TSX_CTRL,
1358 	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1359 	    (all ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1360 	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR,
1361 	    NULL);
1362 }
1363 
1364 void
1365 x86_taa_recalculate(void)
1366 {
1367 	static int taa_saved_mds_disable = 0;
1368 	int taa_need = 0, taa_state = 0;
1369 	int mds_disable = 0, need_mds_recalc = 0;
1370 
1371 	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
1372 	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
1373 	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
1374 		/* TSX is not present */
1375 		x86_taa_state = TAA_NOT_PRESENT;
1376 		return;
1377 	}
1378 
1379 	/* Check to see what mitigation options the CPU gives us */
1380 	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
1381 		/* CPU is not susceptible to TAA */
1382 		taa_need = TAA_TAA_UC;
1383 	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
1384 		/*
1385 		 * CPU can turn off TSX.  This is the next best option
1386 		 * if TAA_NO hardware mitigation isn't present
1387 		 */
1388 		taa_need = TAA_TSX_DISABLE;
1389 	} else {
1390 		/* No TSX/TAA specific remedies are available. */
1391 		if (x86_taa_enable == TAA_TSX_DISABLE) {
1392 			if (bootverbose)
1393 				printf("TSX control not available\n");
1394 			return;
1395 		} else
1396 			taa_need = TAA_VERW;
1397 	}
1398 
1399 	/* Can we automatically take action, or are we being forced? */
1400 	if (x86_taa_enable == TAA_AUTO)
1401 		taa_state = taa_need;
1402 	else
1403 		taa_state = x86_taa_enable;
1404 
1405 	/* No state change, nothing to do */
1406 	if (taa_state == x86_taa_state) {
1407 		if (bootverbose)
1408 			printf("No TSX change made\n");
1409 		return;
1410 	}
1411 
1412 	/* Does the MSR need to be turned on or off? */
1413 	if (taa_state == TAA_TSX_DISABLE)
1414 		taa_set(true, true);
1415 	else if (x86_taa_state == TAA_TSX_DISABLE)
1416 		taa_set(false, true);
1417 
1418 	/* Does MDS need to be set to turn on VERW? */
1419 	if (taa_state == TAA_VERW) {
1420 		taa_saved_mds_disable = hw_mds_disable;
1421 		mds_disable = hw_mds_disable = 1;
1422 		need_mds_recalc = 1;
1423 	} else if (x86_taa_state == TAA_VERW) {
1424 		mds_disable = hw_mds_disable = taa_saved_mds_disable;
1425 		need_mds_recalc = 1;
1426 	}
1427 	if (need_mds_recalc) {
1428 		hw_mds_recalculate();
1429 		if (mds_disable != hw_mds_disable) {
1430 			if (bootverbose)
1431 				printf("Cannot change MDS state for TAA\n");
1432 			/* Don't update our state */
1433 			return;
1434 		}
1435 	}
1436 
1437 	x86_taa_state = taa_state;
1438 	return;
1439 }
1440 
1441 static void
1442 taa_recalculate_boot(void *arg __unused)
1443 {
1444 
1445 	x86_taa_recalculate();
1446 }
1447 SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
1448 
1449 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa,
1450     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1451     "TSX Asynchronous Abort Mitigation");
1452 
1453 static int
1454 sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
1455 {
1456 	int error, val;
1457 
1458 	val = x86_taa_enable;
1459 	error = sysctl_handle_int(oidp, &val, 0, req);
1460 	if (error != 0 || req->newptr == NULL)
1461 		return (error);
1462 	if (val < TAA_NONE || val > TAA_AUTO)
1463 		return (EINVAL);
1464 	x86_taa_enable = val;
1465 	x86_taa_recalculate();
1466 	return (0);
1467 }
1468 
1469 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
1470     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1471     sysctl_taa_handler, "I",
1472     "TAA Mitigation enablement control "
1473     "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
1474 
1475 static int
1476 sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
1477 {
1478 	const char *state;
1479 
1480 	switch (x86_taa_state) {
1481 	case TAA_NONE:
1482 		state = "inactive";
1483 		break;
1484 	case TAA_TSX_DISABLE:
1485 		state = "TSX disabled";
1486 		break;
1487 	case TAA_VERW:
1488 		state = "VERW";
1489 		break;
1490 	case TAA_TAA_UC:
1491 		state = "Mitigated in microcode";
1492 		break;
1493 	case TAA_NOT_PRESENT:
1494 		state = "TSX not present";
1495 		break;
1496 	default:
1497 		state = "unknown";
1498 	}
1499 
1500 	return (SYSCTL_OUT(req, state, strlen(state)));
1501 }
1502 
1503 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
1504     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1505     sysctl_taa_state_handler, "A",
1506     "TAA Mitigation state");
1507 
1508 int __read_frequently cpu_flush_rsb_ctxsw;
1509 SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
1510     CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
1511     "Flush Return Stack Buffer on context switch");
1512 
1513 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
1514     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1515     "MCU Optimization, disable RDSEED mitigation");
1516 
1517 int x86_rngds_mitg_enable = 1;
1518 void
1519 x86_rngds_mitg_recalculate(bool all_cpus)
1520 {
1521 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
1522 		return;
1523 	x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
1524 	    (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1525 	    (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1526 	    IA32_RNGDS_MITG_DIS, NULL);
1527 }
1528 
1529 static int
1530 sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
1531 {
1532 	int error, val;
1533 
1534 	val = x86_rngds_mitg_enable;
1535 	error = sysctl_handle_int(oidp, &val, 0, req);
1536 	if (error != 0 || req->newptr == NULL)
1537 		return (error);
1538 	x86_rngds_mitg_enable = val;
1539 	x86_rngds_mitg_recalculate(true);
1540 	return (0);
1541 }
1542 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
1543     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1544     sysctl_rngds_mitg_enable_handler, "I",
1545     "MCU Optimization, disabling RDSEED mitigation control "
1546     "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");
1547 
1548 static int
1549 sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
1550 {
1551 	const char *state;
1552 
1553 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
1554 		state = "Not applicable";
1555 	} else if (x86_rngds_mitg_enable == 0) {
1556 		state = "RDSEED not serialized";
1557 	} else {
1558 		state = "Mitigated";
1559 	}
1560 	return (SYSCTL_OUT(req, state, strlen(state)));
1561 }
1562 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
1563     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1564     sysctl_rngds_state_handler, "A",
1565     "MCU Optimization state");
1566 
1567 
1568 /*
1569  * Zenbleed.
1570  *
1571  * No corresponding errata is publicly listed.  AMD has issued a security
1572  * bulletin (AMD-SB-7008), entitled "Cross-Process Information Leak".  This
1573  * document lists (as of August 2023) platform firmware's availability target
1574  * dates, with most being November/December 2023.  It will then be up to
1575  * motherboard manufacturers to produce corresponding BIOS updates, which will
1576  * happen with an inevitable lag.  Additionally, for a variety of reasons,
1577  * operators might not be able to apply them everywhere.  On the side of
1578  * standalone CPU microcodes, no plans for availability have been published so
1579  * far.  However, a developer appearing to be an AMD employee has hardcoded in
1580  * Linux revision numbers of future microcodes that are presumed to fix the
1581  * vulnerability.
1582  *
1583  * Given the stability issues encountered with early microcode releases for Rome
1584  * (the only microcode publicly released so far) and the absence of official
1585  * communication on standalone CPU microcodes, we have opted instead for
1586  * matching by default all AMD Zen2 processors which, according to the
1587  * vulnerability's discoverer, are all affected (see
1588  * https://lock.cmpxchg8b.com/zenbleed.html).  This policy, also adopted by
1589  * OpenBSD, may be overridden using the tunable/sysctl
1590  * 'machdep.mitigations.zenbleed.enable'.  We might revise it later depending on
1591  * official statements, microcode updates' public availability and community
1592  * assessment that they actually fix the vulnerability without any instability
1593  * side effects.
1594  */
1595 
1596 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, zenbleed,
1597     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1598     "Zenbleed OS-triggered prevention (via chicken bit)");
1599 
1600 /* 2 is auto, see below. */
1601 int zenbleed_enable = 2;
1602 
1603 void
1604 zenbleed_sanitize_enable(void)
1605 {
1606 	/* Default to auto (2). */
1607 	if (zenbleed_enable < 0 || zenbleed_enable > 2)
1608 		zenbleed_enable = 2;
1609 }
1610 
1611 static bool
1612 zenbleed_chicken_bit_applicable(void)
1613 {
1614 	/* Concerns only bare-metal AMD Zen2 processors. */
1615 	return (cpu_vendor_id == CPU_VENDOR_AMD &&
1616 	    CPUID_TO_FAMILY(cpu_id) == 0x17 &&
1617 	    CPUID_TO_MODEL(cpu_id) >= 0x30 &&
1618 	    vm_guest == VM_GUEST_NO);
1619 }
1620 
1621 static bool
1622 zenbleed_chicken_bit_should_enable(void)
1623 {
1624 	/*
1625 	 * Obey tunable/sysctl.
1626 	 *
1627 	 * As explained above, currently, the automatic setting (2) and the "on"
1628 	 * one (1) have the same effect.  In the future, we might additionally
1629 	 * check for specific microcode revisions as part of the automatic
1630 	 * determination.
1631 	 */
1632 	return (zenbleed_enable != 0);
1633 }
1634 
1635 void
1636 zenbleed_check_and_apply(bool all_cpus)
1637 {
1638 	bool set;
1639 
1640 	if (!zenbleed_chicken_bit_applicable())
1641 		return;
1642 
1643 	set = zenbleed_chicken_bit_should_enable();
1644 
1645 	x86_msr_op(MSR_DE_CFG,
1646 	    (set ? MSR_OP_OR : MSR_OP_ANDNOT) |
1647 	    (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1648 	    DE_CFG_ZEN2_FP_BACKUP_FIX_BIT, NULL);
1649 }
1650 
1651 static int
1652 sysctl_zenbleed_enable_handler(SYSCTL_HANDLER_ARGS)
1653 {
1654 	int error, val;
1655 
1656 	val = zenbleed_enable;
1657 	error = sysctl_handle_int(oidp, &val, 0, req);
1658 	if (error != 0 || req->newptr == NULL)
1659 		return (error);
1660 	zenbleed_enable = val;
1661 	zenbleed_sanitize_enable();
1662 	zenbleed_check_and_apply(true);
1663 	return (0);
1664 }
1665 SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, enable, CTLTYPE_INT |
1666     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1667     sysctl_zenbleed_enable_handler, "I",
1668     "Enable Zenbleed OS-triggered mitigation (chicken bit) "
1669     "(0: Force disable, 1: Force enable, 2: Automatic determination)");
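
/*
 * Editor's note (usage illustration, not in the original source): the knob
 * discussed in the Zenbleed comment block above is the OID just defined,
 * settable as a tunable or at runtime, e.g.:
 *
 *	machdep.mitigations.zenbleed.enable=0	(in /boot/loader.conf)
 *	# sysctl machdep.mitigations.zenbleed.enable=2
 *
 * With the current policy, 2 ("auto") behaves like 1 on all matched Zen2
 * parts.
 */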
1670 
1671 static int
1672 sysctl_zenbleed_state_handler(SYSCTL_HANDLER_ARGS)
1673 {
1674 	const char *state;
1675 
1676 	if (!zenbleed_chicken_bit_applicable())
1677 		state = "Not applicable";
1678 	else if (zenbleed_chicken_bit_should_enable())
1679 		state = "Mitigation enabled";
1680 	else
1681 		state = "Mitigation disabled";
1682 	return (SYSCTL_OUT(req, state, strlen(state)));
1683 }
1684 SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, state,
1685     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1686     sysctl_zenbleed_state_handler, "A",
1687     "Zenbleed OS-triggered mitigation (chicken bit) state");
1688 
1689 
1690 /*
1691  * Enable and restore kernel text write permissions.
1692  * Callers must ensure that disable_wp()/restore_wp() are executed
1693  * without rescheduling on the same core.
1694  */
1695 bool
1696 disable_wp(void)
1697 {
1698 	u_int cr0;
1699 
1700 	cr0 = rcr0();
1701 	if ((cr0 & CR0_WP) == 0)
1702 		return (false);
1703 	load_cr0(cr0 & ~CR0_WP);
1704 	return (true);
1705 }
1706 
1707 void
1708 restore_wp(bool old_wp)
1709 {
1710 
1711 	if (old_wp)
1712 		load_cr0(rcr0() | CR0_WP);
1713 }
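
/*
 * Editor's sketch (hypothetical helper, not part of the original source):
 * the intended pairing of disable_wp()/restore_wp().  A critical section is
 * one way to satisfy the "no rescheduling on the same core" requirement
 * stated above.
 */
static void __unused
patch_kernel_text_byte(uint8_t *dst, uint8_t val)
{
	bool wp;

	critical_enter();
	wp = disable_wp();
	*dst = val;	/* write into otherwise write-protected kernel text */
	restore_wp(wp);
	critical_exit();
}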
1714 
1715 bool
1716 acpi_get_fadt_bootflags(uint16_t *flagsp)
1717 {
1718 #ifdef DEV_ACPI
1719 	ACPI_TABLE_FADT *fadt;
1720 	vm_paddr_t physaddr;
1721 
1722 	physaddr = acpi_find_table(ACPI_SIG_FADT);
1723 	if (physaddr == 0)
1724 		return (false);
1725 	fadt = acpi_map_table(physaddr, ACPI_SIG_FADT);
1726 	if (fadt == NULL)
1727 		return (false);
1728 	*flagsp = fadt->BootFlags;
1729 	acpi_unmap_table(fadt);
1730 	return (true);
1731 #else
1732 	return (false);
1733 #endif
1734 }
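
/*
 * Editor's sketch (not part of the original source): a typical consumer of
 * acpi_get_fadt_bootflags(), testing one of the FADT IA-PC boot
 * architecture flags.  ACPI_FADT_8042 is the ACPICA flag name assumed here.
 */
static bool __unused
example_platform_has_8042(void)
{
	uint16_t flags;

	/* Be conservative and assume the device exists without a FADT. */
	if (!acpi_get_fadt_bootflags(&flags))
		return (true);
	return ((flags & ACPI_FADT_8042) != 0);
}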
1735 
1736 DEFINE_IFUNC(, uint64_t, rdtsc_ordered, (void))
1737 {
1738 	bool cpu_is_amd = cpu_vendor_id == CPU_VENDOR_AMD ||
1739 	    cpu_vendor_id == CPU_VENDOR_HYGON;
1740 
1741 	if ((amd_feature & AMDID_RDTSCP) != 0)
1742 		return (rdtscp);
1743 	else if ((cpu_feature & CPUID_SSE2) != 0)
1744 		return (cpu_is_amd ? rdtsc_ordered_mfence :
1745 		    rdtsc_ordered_lfence);
1746 	else
1747 		return (rdtsc);
1748 }
1749