xref: /linux/arch/x86/kernel/cpu/intel.c (revision 5e2cb28dd7e182dfa641550dfa225913509ad45d)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/pgtable.h>
4 
5 #include <linux/string.h>
6 #include <linux/bitops.h>
7 #include <linux/smp.h>
8 #include <linux/sched.h>
9 #include <linux/sched/clock.h>
10 #include <linux/semaphore.h>
11 #include <linux/thread_info.h>
12 #include <linux/init.h>
13 #include <linux/uaccess.h>
14 #include <linux/workqueue.h>
15 #include <linux/delay.h>
16 #include <linux/cpuhotplug.h>
17 
18 #include <asm/cpufeature.h>
19 #include <asm/msr.h>
20 #include <asm/bugs.h>
21 #include <asm/cpu.h>
22 #include <asm/intel-family.h>
23 #include <asm/microcode.h>
24 #include <asm/hwcap2.h>
25 #include <asm/elf.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/cmdline.h>
28 #include <asm/traps.h>
29 #include <asm/resctrl.h>
30 #include <asm/numa.h>
31 #include <asm/thermal.h>
32 
33 #ifdef CONFIG_X86_64
34 #include <linux/topology.h>
35 #endif
36 
37 #include "cpu.h"
38 
39 #ifdef CONFIG_X86_LOCAL_APIC
40 #include <asm/mpspec.h>
41 #include <asm/apic.h>
42 #endif
43 
/*
 * Operating modes for split lock / bus lock detection.
 *
 * sld_off is the zero value and the compile-time default of sld_state.
 * Each mode is selected by the matching string in sld_options[]
 * ("off", "warn", "fatal", "ratelimit:").
 */
enum split_lock_detect_state {
	sld_off = 0,
	sld_warn,
	sld_fatal,
	sld_ratelimit,
};
50 
/*
 * Default to sld_off because most systems do not support split lock detection.
 * sld_state_setup() will switch this to sld_warn on systems that support
 * split lock/bus lock detect, unless there is a command line override.
 */
static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
/*
 * NOTE(review): presumably a cached copy of the MSR_TEST_CTRL value; the
 * code that fills and uses it is not in this part of the file - confirm.
 */
static u64 msr_test_ctrl_cache __ro_after_init;

/*
 * With a name like MSR_TEST_CTL it should go without saying, but don't touch
 * MSR_TEST_CTL unless the CPU is one of the whitelisted models.  Writing it
 * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
 */
static bool cpu_model_supports_sld __ro_after_init;
65 
66 /*
67  * Processors which have self-snooping capability can handle conflicting
68  * memory type across CPUs by snooping its own cache. However, there exists
69  * CPU models in which having conflicting memory types still leads to
70  * unpredictable behavior, machine check errors, or hangs. Clear this
71  * feature to prevent its use on machines with known erratas.
72  */
73 static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
74 {
75 	switch (c->x86_model) {
76 	case INTEL_FAM6_CORE_YONAH:
77 	case INTEL_FAM6_CORE2_MEROM:
78 	case INTEL_FAM6_CORE2_MEROM_L:
79 	case INTEL_FAM6_CORE2_PENRYN:
80 	case INTEL_FAM6_CORE2_DUNNINGTON:
81 	case INTEL_FAM6_NEHALEM:
82 	case INTEL_FAM6_NEHALEM_G:
83 	case INTEL_FAM6_NEHALEM_EP:
84 	case INTEL_FAM6_NEHALEM_EX:
85 	case INTEL_FAM6_WESTMERE:
86 	case INTEL_FAM6_WESTMERE_EP:
87 	case INTEL_FAM6_SANDYBRIDGE:
88 		setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
89 	}
90 }
91 
/* Set once "ring3mwait=disable" has been seen on the kernel command line. */
static bool ring3mwait_disabled __read_mostly;

/*
 * Handler for the "ring3mwait=disable" boot parameter.
 *
 * Records the request in ring3mwait_disabled, which makes
 * probe_xeon_phi_r3mwait() return without enabling the feature.
 * The argument is not used. Always returns 1.
 */
static int __init ring3mwait_disable(char *__unused)
{
	ring3mwait_disabled = true;
	return 1;
}
__setup("ring3mwait=disable", ring3mwait_disable);
100 
101 static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
102 {
103 	/*
104 	 * Ring 3 MONITOR/MWAIT feature cannot be detected without
105 	 * cpu model and family comparison.
106 	 */
107 	if (c->x86 != 6)
108 		return;
109 	switch (c->x86_model) {
110 	case INTEL_FAM6_XEON_PHI_KNL:
111 	case INTEL_FAM6_XEON_PHI_KNM:
112 		break;
113 	default:
114 		return;
115 	}
116 
117 	if (ring3mwait_disabled)
118 		return;
119 
120 	set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
121 	this_cpu_or(msr_misc_features_shadow,
122 		    1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
123 
124 	if (c == &boot_cpu_data)
125 		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
126 }
127 
128 /*
129  * Early microcode releases for the Spectre v2 mitigation were broken.
 * Information taken from:
131  * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
132  * - https://kb.vmware.com/s/article/52345
133  * - Microcode revisions observed in the wild
134  * - Release note from 20180108 microcode release
135  */
/*
 * One entry of a microcode blacklist: a CPU model/stepping pair and a
 * microcode revision. See bad_spectre_microcode() for how it is matched.
 */
struct sku_microcode {
	u8 model;	/* compared with cpuinfo_x86::x86_model */
	u8 stepping;	/* compared with cpuinfo_x86::x86_stepping */
	u32 microcode;	/* compared with cpuinfo_x86::microcode */
};
/*
 * { model, stepping, microcode revision } triples. bad_spectre_microcode()
 * treats a CPU as affected when model and stepping match and the running
 * microcode revision is less than or equal to the one listed here.
 */
static const struct sku_microcode spectre_bad_microcodes[] = {
	{ INTEL_FAM6_KABYLAKE,		0x0B,	0x80 },
	{ INTEL_FAM6_KABYLAKE,		0x0A,	0x80 },
	{ INTEL_FAM6_KABYLAKE,		0x09,	0x80 },
	{ INTEL_FAM6_KABYLAKE_L,	0x0A,	0x80 },
	{ INTEL_FAM6_KABYLAKE_L,	0x09,	0x80 },
	{ INTEL_FAM6_SKYLAKE_X,		0x03,	0x0100013e },
	{ INTEL_FAM6_SKYLAKE_X,		0x04,	0x0200003c },
	{ INTEL_FAM6_BROADWELL,		0x04,	0x28 },
	{ INTEL_FAM6_BROADWELL_G,	0x01,	0x1b },
	{ INTEL_FAM6_BROADWELL_D,	0x02,	0x14 },
	{ INTEL_FAM6_BROADWELL_D,	0x03,	0x07000011 },
	{ INTEL_FAM6_BROADWELL_X,	0x01,	0x0b000025 },
	{ INTEL_FAM6_HASWELL_L,		0x01,	0x21 },
	{ INTEL_FAM6_HASWELL_G,		0x01,	0x18 },
	{ INTEL_FAM6_HASWELL,		0x03,	0x23 },
	{ INTEL_FAM6_HASWELL_X,		0x02,	0x3b },
	{ INTEL_FAM6_HASWELL_X,		0x04,	0x10 },
	{ INTEL_FAM6_IVYBRIDGE_X,	0x04,	0x42a },
	/* Observed in the wild */
	{ INTEL_FAM6_SANDYBRIDGE_X,	0x06,	0x61b },
	{ INTEL_FAM6_SANDYBRIDGE_X,	0x07,	0x712 },
};
164 
165 static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
166 {
167 	int i;
168 
169 	/*
170 	 * We know that the hypervisor lie to us on the microcode version so
171 	 * we may as well hope that it is running the correct version.
172 	 */
173 	if (cpu_has(c, X86_FEATURE_HYPERVISOR))
174 		return false;
175 
176 	if (c->x86 != 6)
177 		return false;
178 
179 	for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
180 		if (c->x86_model == spectre_bad_microcodes[i].model &&
181 		    c->x86_stepping == spectre_bad_microcodes[i].stepping)
182 			return (c->microcode <= spectre_bad_microcodes[i].microcode);
183 	}
184 	return false;
185 }
186 
/*
 * Early Intel-specific setup for the CPU described by @c.
 *
 * Registered as the c_early_init hook of intel_cpu_dev and also called
 * at the start of init_intel(). It unmasks CPUID levels, records the
 * microcode revision, applies a series of family/model/stepping specific
 * capability fixups and finally runs the early SMT sibling detection.
 *
 * The steps are order-sensitive: the microcode revision is read before
 * the checks that compare against it, and the CPUID unmasking happens
 * before any capability bit is evaluated.
 */
static void early_init_intel(struct cpuinfo_x86 *c)
{
	u64 misc_enable;

	/*
	 * Unmask CPUID levels if masked. When msr_clear_bit() returns a
	 * positive value the CPUID level and the capability words are
	 * re-read (NOTE(review): a positive return presumably means the
	 * bit was set and has now been cleared - confirm in msr_clear_bit()).
	 */
	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
		if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
				  MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
			c->cpuid_level = cpuid_eax(0);
			get_cpu_cap(c);
		}
	}

	/* Constant TSC by family/model: family 0xf model >= 3, family 6 model >= 0xe */
	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
		(c->x86 == 0x6 && c->x86_model >= 0x0e))
		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);

	/* Record the microcode revision; the checks below depend on it. */
	if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
		c->microcode = intel_get_microcode_revision();

	/* Now if any of them are set, check the blacklist and clear the lot */
	if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
	     cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
	     cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
	     cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
		pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
		setup_clear_cpu_cap(X86_FEATURE_IBRS);
		setup_clear_cpu_cap(X86_FEATURE_IBPB);
		setup_clear_cpu_cap(X86_FEATURE_STIBP);
		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
		setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
		setup_clear_cpu_cap(X86_FEATURE_SSBD);
		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
	}

	/*
	 * Atom erratum AAE44/AAF40/AAG38/AAH41:
	 *
	 * A race condition between speculative fetches and invalidating
	 * a large page.  This is worked around in microcode, but we
	 * need the microcode to have already been loaded... so if it is
	 * not, recommend a BIOS update and disable large pages.
	 */
	if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
	    c->microcode < 0x20e) {
		pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
		clear_cpu_cap(c, X86_FEATURE_PSE);
	}

#ifdef CONFIG_X86_64
	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
	if (c->x86 == 15 && c->x86_cache_alignment == 64)
		c->x86_cache_alignment = 128;
#endif

	/* CPUID workaround for 0F33/0F34 CPU */
	if (c->x86 == 0xF && c->x86_model == 0x3
	    && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
		c->x86_phys_bits = 36;

	/*
	 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
	 * with P/T states and does not stop in deep C-states.
	 *
	 * It is also reliable across cores and sockets. (but not across
	 * cabinets - we turn it off in that case explicitly.)
	 */
	if (c->x86_power & (1 << 8)) {
		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
	}

	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
	if (c->x86 == 6) {
		switch (c->x86_model) {
		case INTEL_FAM6_ATOM_SALTWELL_MID:
		case INTEL_FAM6_ATOM_SALTWELL_TABLET:
		case INTEL_FAM6_ATOM_SILVERMONT_MID:
		case INTEL_FAM6_ATOM_AIRMONT_NP:
			set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
			break;
		default:
			break;
		}
	}

	/*
	 * There is a known erratum on Pentium III and Core Solo
	 * and Core Duo CPUs.
	 * " Page with PAT set to WC while associated MTRR is UC
	 *   may consolidate to UC "
	 * Because of this erratum, it is better to stick with
	 * setting WC in MTRR rather than using PAT on these CPUs.
	 *
	 * Enable PAT WC only on P4, Core 2 or later CPUs.
	 */
	if (c->x86 == 6 && c->x86_model < 15)
		clear_cpu_cap(c, X86_FEATURE_PAT);

	/*
	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
	 * clear the fast string and enhanced fast string CPU capabilities.
	 */
	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
		if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
			pr_info("Disabled fast string operations\n");
			setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
			setup_clear_cpu_cap(X86_FEATURE_ERMS);
		}
	}

	/*
	 * Intel Quark Core DevMan_001.pdf section 6.4.11
	 * "The operating system also is required to invalidate (i.e., flush)
	 *  the TLB when any changes are made to any of the page table entries.
	 *  The operating system must reload CR3 to cause the TLB to be flushed"
	 *
	 * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
	 * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
	 * to be modified.
	 */
	if (c->x86 == 5 && c->x86_model == 9) {
		pr_info("Disabling PGE capability bit\n");
		setup_clear_cpu_cap(X86_FEATURE_PGE);
	}

	check_memory_type_self_snoop_errata(c);

	/*
	 * Get the number of SMT siblings early from the extended topology
	 * leaf, if available. Otherwise try the legacy SMT detection.
	 */
	if (detect_extended_topology_early(c) < 0)
		detect_ht_early(c);
}
326 
/*
 * c_bsp_init hook of intel_cpu_dev: the only Intel-specific step here is
 * resctrl detection for @c.
 */
static void bsp_init_intel(struct cpuinfo_x86 *c)
{
	resctrl_cpu_detect(c);
}
331 
332 #ifdef CONFIG_X86_32
333 /*
334  *	Early probe support logic for ppro memory erratum #50
335  *
336  *	This is called before we do cpu ident work
337  */
338 
339 int ppro_with_ram_bug(void)
340 {
341 	/* Uses data from early_cpu_detect now */
342 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
343 	    boot_cpu_data.x86 == 6 &&
344 	    boot_cpu_data.x86_model == 1 &&
345 	    boot_cpu_data.x86_stepping < 8) {
346 		pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
347 		return 1;
348 	}
349 	return 0;
350 }
351 
352 static void intel_smp_check(struct cpuinfo_x86 *c)
353 {
354 	/* calling is from identify_secondary_cpu() ? */
355 	if (!c->cpu_index)
356 		return;
357 
358 	/*
359 	 * Mask B, Pentium, but not Pentium MMX
360 	 */
361 	if (c->x86 == 5 &&
362 	    c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
363 	    c->x86_model <= 3) {
364 		/*
365 		 * Remember we have B step Pentia with bugs
366 		 */
367 		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
368 				    "with B stepping processors.\n");
369 	}
370 }
371 
/* Non-zero once "forcepae" has been seen on the kernel command line. */
static int forcepae;
/*
 * Handler for the "forcepae" boot parameter: records the request so that
 * intel_workarounds() force-sets X86_FEATURE_PAE. The argument is not
 * used. Always returns 1.
 */
static int __init forcepae_setup(char *__unused)
{
	forcepae = 1;
	return 1;
}
__setup("forcepae", forcepae_setup);
379 
/*
 * Workarounds for old 32-bit Intel processors (this variant is only
 * built with CONFIG_X86_32). Called from init_intel() for @c.
 *
 * Covers: the F0 0F bug flag, the Pentium Pro SEP CPUID bug, the
 * "forcepae" override, P4 Xeon erratum 037, the 11AP local APIC bug
 * flag, the movsl alignment mask and the B-stepping SMP warning.
 */
static void intel_workarounds(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_F00F_BUG
	/*
	 * All models of Pentium and Pentium with MMX technology CPUs
	 * have the F0 0F bug, which lets nonprivileged users lock up the
	 * system. Announce that the fault handler will be checking for it.
	 * The Quark is also family 5, but does not have the same bug.
	 */
	clear_cpu_bug(c, X86_BUG_F00F);
	if (c->x86 == 5 && c->x86_model < 9) {
		/* Ensures the notice below is printed only once. */
		static int f00f_workaround_enabled;

		set_cpu_bug(c, X86_BUG_F00F);
		if (!f00f_workaround_enabled) {
			pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
			f00f_workaround_enabled = 1;
		}
	}
#endif

	/*
	 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
	 * model 3 mask 3
	 */
	if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
		clear_cpu_cap(c, X86_FEATURE_SEP);

	/*
	 * PAE CPUID issue: many Pentium M report no PAE but may have a
	 * functionally usable PAE implementation.
	 * Forcefully enable PAE if kernel parameter "forcepae" is present.
	 */
	if (forcepae) {
		pr_warn("PAE forced!\n");
		set_cpu_cap(c, X86_FEATURE_PAE);
		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
	}

	/*
	 * P4 Xeon erratum 037 workaround.
	 * Hardware prefetcher may cause stale data to be loaded into the cache.
	 */
	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
		if (msr_set_bit(MSR_IA32_MISC_ENABLE,
				MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
			pr_info("CPU: C0 stepping P4 Xeon detected.\n");
			pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
		}
	}

	/*
	 * See if we have a good local APIC by checking for buggy Pentia,
	 * i.e. all B steppings and the C2 stepping of P54C when using their
	 * integrated APIC (see 11AP erratum in "Pentium Processor
	 * Specification Update").
	 */
	if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
	    (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
		set_cpu_bug(c, X86_BUG_11AP);


#ifdef CONFIG_X86_INTEL_USERCOPY
	/*
	 * Set up the preferred alignment for movsl bulk memory moves
	 */
	switch (c->x86) {
	case 4:		/* 486: untested */
		break;
	case 5:		/* Old Pentia: untested */
		break;
	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
		movsl_mask.mask = 7;
		break;
	case 15:	/* P4 is OK down to 8-byte alignment */
		movsl_mask.mask = 7;
		break;
	}
#endif

	intel_smp_check(c);
}
462 #else
/* Not CONFIG_X86_32: there are no workarounds to apply, so this is a no-op. */
static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
466 #endif
467 
/*
 * Record the NUMA node of the CPU this code is running on.
 * Compiles to an empty function without CONFIG_NUMA.
 */
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
	int this_cpu = smp_processor_id();
	unsigned nid = numa_cpu_node(this_cpu);

	/*
	 * None of the funky fallback heuristics of the AMD version are
	 * used here for now: when the reported node is missing or not
	 * online, reuse the value from init_cpu_to_node().
	 */
	if (!(nid != NUMA_NO_NODE && node_online(nid)))
		nid = cpu_to_node(this_cpu);

	numa_set_node(this_cpu, nid);
#endif
}
484 
#define MSR_IA32_TME_ACTIVATE		0x982

/*
 * Helpers to access TME_ACTIVATE MSR.
 *
 * The argument is parenthesized in every helper so that an expression
 * such as "a | b" is evaluated as a whole before the shift/mask is applied.
 */
#define TME_ACTIVATE_LOCKED(x)		((x) & 0x1)
#define TME_ACTIVATE_ENABLED(x)		((x) & 0x2)

#define TME_ACTIVATE_POLICY(x)		(((x) >> 4) & 0xf)	/* Bits 7:4 */
#define TME_ACTIVATE_POLICY_AES_XTS_128	0

#define TME_ACTIVATE_KEYID_BITS(x)	(((x) >> 32) & 0xf)	/* Bits 35:32 */

#define TME_ACTIVATE_CRYPTO_ALGS(x)	(((x) >> 48) & 0xffff)	/* Bits 63:48 */
#define TME_ACTIVATE_CRYPTO_AES_XTS_128	1

/* Values for mktme_status (SW only construct) */
#define MKTME_ENABLED			0
#define MKTME_DISABLED			1
#define MKTME_UNINITIALIZED		2
static int mktme_status = MKTME_UNINITIALIZED;
504 
/*
 * Inspect MSR_IA32_TME_ACTIVATE on the calling CPU; init_intel() calls
 * this only when @c has X86_FEATURE_TME.
 *
 * The first invocation (mktme_status still MKTME_UNINITIALIZED) records
 * the MSR value and decides mktme_status. Later invocations compare their
 * MSR value with the recorded one and mark MKTME disabled on a mismatch.
 * Whenever TME is locked and enabled, the KeyID bits are subtracted from
 * c->x86_phys_bits.
 */
static void detect_tme(struct cpuinfo_x86 *c)
{
	u64 tme_activate, tme_policy, tme_crypto_algs;
	int keyid_bits = 0, nr_keyids = 0;
	/* MSR value seen by the first invocation; reference for later ones. */
	static u64 tme_activate_cpu0 = 0;

	rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);

	if (mktme_status != MKTME_UNINITIALIZED) {
		if (tme_activate != tme_activate_cpu0) {
			/* Broken BIOS? */
			pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
			pr_err_once("x86/tme: MKTME is not usable\n");
			mktme_status = MKTME_DISABLED;

			/* Proceed. We may need to exclude bits from x86_phys_bits. */
		}
	} else {
		tme_activate_cpu0 = tme_activate;
	}

	/* Both the lock bit and the enable bit must be set for TME to count as on. */
	if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
		pr_info_once("x86/tme: not enabled by BIOS\n");
		mktme_status = MKTME_DISABLED;
		return;
	}

	/* Policy and algorithm checks are done by the first invocation only. */
	if (mktme_status != MKTME_UNINITIALIZED)
		goto detect_keyid_bits;

	pr_info("x86/tme: enabled by BIOS\n");

	tme_policy = TME_ACTIVATE_POLICY(tme_activate);
	if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
		pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);

	tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
	if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
		pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
				tme_crypto_algs);
		mktme_status = MKTME_DISABLED;
	}
detect_keyid_bits:
	keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
	/* N KeyID bits give 2^N - 1 KeyIDs here; zero bits means none. */
	nr_keyids = (1UL << keyid_bits) - 1;
	if (nr_keyids) {
		pr_info_once("x86/mktme: enabled by BIOS\n");
		pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
	} else {
		pr_info_once("x86/mktme: disabled by BIOS\n");
	}

	if (mktme_status == MKTME_UNINITIALIZED) {
		/* MKTME is usable */
		mktme_status = MKTME_ENABLED;
	}

	/*
	 * KeyID bits effectively lower the number of physical address
	 * bits.  Update cpuinfo_x86::x86_phys_bits accordingly.
	 */
	c->x86_phys_bits -= keyid_bits;
}
568 
569 static void init_cpuid_fault(struct cpuinfo_x86 *c)
570 {
571 	u64 msr;
572 
573 	if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
574 		if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
575 			set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
576 	}
577 }
578 
579 static void init_intel_misc_features(struct cpuinfo_x86 *c)
580 {
581 	u64 msr;
582 
583 	if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
584 		return;
585 
586 	/* Clear all MISC features */
587 	this_cpu_write(msr_misc_features_shadow, 0);
588 
589 	/* Check features and update capabilities and shadow control bits */
590 	init_cpuid_fault(c);
591 	probe_xeon_phi_r3mwait(c);
592 
593 	msr = this_cpu_read(msr_misc_features_shadow);
594 	wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
595 }
596 
/* Forward declarations: init_intel() below calls both before they are defined. */
static void split_lock_init(void);
static void bus_lock_init(void);
599 
/*
 * Main Intel initialisation for the CPU described by @c; registered as
 * the c_init hook of intel_cpu_dev.
 *
 * Runs the early setup and the workarounds first, then topology and
 * cache detection, a number of capability and bug flags, the legacy
 * 32-bit model naming, NUMA node assignment, TME, the misc-features MSR,
 * split/bus lock detection and finally thermal setup.
 */
static void init_intel(struct cpuinfo_x86 *c)
{
	early_init_intel(c);

	intel_workarounds(c);

	/*
	 * Detect the extended topology information if available. This
	 * will reinitialise the initial_apicid which will be used
	 * in init_intel_cacheinfo()
	 */
	detect_extended_topology(c);

	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
		/*
		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
		 * detection.
		 */
		detect_num_cpu_cores(c);
#ifdef CONFIG_X86_32
		detect_ht(c);
#endif
	}

	init_intel_cacheinfo(c);

	/* CPUID leaf 10 is only queried when the CPU reports a level above 9. */
	if (c->cpuid_level > 9) {
		unsigned eax = cpuid_eax(10);
		/* Check for version and the number of counters */
		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
	}

	if (cpu_has(c, X86_FEATURE_XMM2))
		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);

	/* BTS/PEBS are reported unless IA32_MISC_ENABLE marks them unavailable. */
	if (boot_cpu_has(X86_FEATURE_DS)) {
		unsigned int l1, l2;

		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
		if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
			set_cpu_cap(c, X86_FEATURE_BTS);
		if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
			set_cpu_cap(c, X86_FEATURE_PEBS);
	}

	/* Family 6 models 29, 46 and 47 get the CLFLUSH/MONITOR bug flag. */
	if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
		set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);

	/* Goldmont gets the MONITOR bug flag. */
	if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
		((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
		set_cpu_bug(c, X86_BUG_MONITOR);

#ifdef CONFIG_X86_64
	if (c->x86 == 15)
		c->x86_cache_alignment = c->x86_clflush_size * 2;
	if (c->x86 == 6)
		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
	/*
	 * Names for the Pentium II/Celeron processors
	 * detectable only by also checking the cache size.
	 * Dixon is NOT a Celeron.
	 */
	if (c->x86 == 6) {
		unsigned int l2 = c->x86_cache_size;
		char *p = NULL;

		switch (c->x86_model) {
		case 5:
			if (l2 == 0)
				p = "Celeron (Covington)";
			else if (l2 == 256)
				p = "Mobile Pentium II (Dixon)";
			break;

		case 6:
			if (l2 == 128)
				p = "Celeron (Mendocino)";
			else if (c->x86_stepping == 0 || c->x86_stepping == 5)
				p = "Celeron-A";
			break;

		case 8:
			if (l2 == 128)
				p = "Celeron (Coppermine)";
			break;
		}

		/* Only override the model name when one of the cases matched. */
		if (p)
			strcpy(c->x86_model_id, p);
	}

	if (c->x86 == 15)
		set_cpu_cap(c, X86_FEATURE_P4);
	if (c->x86 == 6)
		set_cpu_cap(c, X86_FEATURE_P3);
#endif

	/* Assign the NUMA node of this CPU (no-op without CONFIG_NUMA) */
	srat_detect_node(c);

	init_ia32_feat_ctl(c);

	if (cpu_has(c, X86_FEATURE_TME))
		detect_tme(c);

	init_intel_misc_features(c);

	split_lock_init();
	bus_lock_init();

	intel_init_thermal(c);
}
715 
716 #ifdef CONFIG_X86_32
717 static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
718 {
719 	/*
720 	 * Intel PIII Tualatin. This comes in two flavours.
721 	 * One has 256kb of cache, the other 512. We have no way
722 	 * to determine which, so we use a boottime override
723 	 * for the 512kb model, and assume 256 otherwise.
724 	 */
725 	if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
726 		size = 256;
727 
728 	/*
729 	 * Intel Quark SoC X1000 contains a 4-way set associative
730 	 * 16K cache with a 16 byte cache line and 256 lines per tag
731 	 */
732 	if ((c->x86 == 5) && (c->x86_model == 9))
733 		size = 16;
734 	return size;
735 }
736 #endif
737 
/*
 * TLB type codes stored in the tlb_type column of intel_tlb_table[] and
 * dispatched on in intel_tlb_lookup(). 0 is not used as a type: it marks
 * the terminating table entry.
 */

/* Instruction TLBs */
#define TLB_INST_4K	0x01
#define TLB_INST_4M	0x02
#define TLB_INST_2M_4M	0x03

#define TLB_INST_ALL	0x05
#define TLB_INST_1G	0x06

/* Data TLBs */
#define TLB_DATA_4K	0x11
#define TLB_DATA_4M	0x12
#define TLB_DATA_2M_4M	0x13
#define TLB_DATA_4K_4M	0x14

#define TLB_DATA_1G	0x16

/* DATA0 types are accounted together with the plain data types in intel_tlb_lookup() */
#define TLB_DATA0_4K	0x21
#define TLB_DATA0_4M	0x22
#define TLB_DATA0_2M_4M	0x23

/* STLB types update both the instruction and the data entry counts */
#define STLB_4K		0x41
#define STLB_4K_2M	0x42
758 
/*
 * Descriptor table used by intel_tlb_lookup().
 *
 * Columns: descriptor byte, TLB type code (TLB_* / STLB_* above), number
 * of entries, human readable description. The table is terminated by an
 * entry whose descriptor and type are 0.
 */
static const struct _tlb_table intel_tlb_table[] = {
	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" },
	{ 0x02, TLB_INST_4M,		2,	" TLB_INST 4 MByte pages, full associative" },
	{ 0x03, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way set associative" },
	{ 0x04, TLB_DATA_4M,		8,	" TLB_DATA 4 MByte pages, 4-way set associative" },
	{ 0x05, TLB_DATA_4M,		32,	" TLB_DATA 4 MByte pages, 4-way set associative" },
	{ 0x0b, TLB_INST_4M,		4,	" TLB_INST 4 MByte pages, 4-way set associative" },
	{ 0x4f, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages" },
	{ 0x50, TLB_INST_ALL,		64,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
	{ 0x51, TLB_INST_ALL,		128,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
	{ 0x52, TLB_INST_ALL,		256,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
	{ 0x55, TLB_INST_2M_4M,		7,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
	{ 0x56, TLB_DATA0_4M,		16,	" TLB_DATA0 4 MByte pages, 4-way set associative" },
	{ 0x57, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, 4-way associative" },
	{ 0x59, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, fully associative" },
	{ 0x5a, TLB_DATA0_2M_4M,	32,	" TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" },
	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" },
	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" },
	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, full associative" },
	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" },
	{ 0x6b, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 8-way associative" },
	{ 0x6c, TLB_DATA_2M_4M,		128,	" TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
	{ 0x6d, TLB_DATA_1G,		16,	" TLB_DATA 1 GByte pages, fully associative" },
	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },
	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },
	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set associative" },
	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set associative" },
	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
	{ 0xc2, TLB_DATA_2M_4M,		16,	" TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },
	/* Terminator: intel_tlb_lookup() stops here for unknown descriptors. */
	{ 0x00, 0, 0 }
};
798 
799 static void intel_tlb_lookup(const unsigned char desc)
800 {
801 	unsigned char k;
802 	if (desc == 0)
803 		return;
804 
805 	/* look up this descriptor in the table */
806 	for (k = 0; intel_tlb_table[k].descriptor != desc &&
807 	     intel_tlb_table[k].descriptor != 0; k++)
808 		;
809 
810 	if (intel_tlb_table[k].tlb_type == 0)
811 		return;
812 
813 	switch (intel_tlb_table[k].tlb_type) {
814 	case STLB_4K:
815 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
816 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
817 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
818 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
819 		break;
820 	case STLB_4K_2M:
821 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
822 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
823 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
824 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
825 		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
826 			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
827 		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
828 			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
829 		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
830 			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
831 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
832 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
833 		break;
834 	case TLB_INST_ALL:
835 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
836 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
837 		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
838 			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
839 		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
840 			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
841 		break;
842 	case TLB_INST_4K:
843 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
844 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
845 		break;
846 	case TLB_INST_4M:
847 		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
848 			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
849 		break;
850 	case TLB_INST_2M_4M:
851 		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
852 			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
853 		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
854 			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
855 		break;
856 	case TLB_DATA_4K:
857 	case TLB_DATA0_4K:
858 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
859 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
860 		break;
861 	case TLB_DATA_4M:
862 	case TLB_DATA0_4M:
863 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
864 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
865 		break;
866 	case TLB_DATA_2M_4M:
867 	case TLB_DATA0_2M_4M:
868 		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
869 			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
870 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
871 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
872 		break;
873 	case TLB_DATA_4K_4M:
874 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
875 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
876 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
877 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
878 		break;
879 	case TLB_DATA_1G:
880 		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
881 			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
882 		break;
883 	}
884 }
885 
886 static void intel_detect_tlb(struct cpuinfo_x86 *c)
887 {
888 	int i, j, n;
889 	unsigned int regs[4];
890 	unsigned char *desc = (unsigned char *)regs;
891 
892 	if (c->cpuid_level < 2)
893 		return;
894 
895 	/* Number of times to iterate */
896 	n = cpuid_eax(2) & 0xFF;
897 
898 	for (i = 0 ; i < n ; i++) {
899 		cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
900 
901 		/* If bit 31 is set, this is an unknown format */
902 		for (j = 0 ; j < 3 ; j++)
903 			if (regs[j] & (1 << 31))
904 				regs[j] = 0;
905 
906 		/* Byte 0 is level count, not a descriptor */
907 		for (j = 1 ; j < 16 ; j++)
908 			intel_tlb_lookup(desc[j]);
909 	}
910 }
911 
/*
 * Vendor descriptor for Intel CPUs: identification strings, the legacy
 * model name tables and cache size hook (32-bit only), and the Intel
 * specific init hooks defined in this file. Registered with
 * cpu_dev_register() right below.
 */
static const struct cpu_dev intel_cpu_dev = {
	.c_vendor	= "Intel",
	.c_ident	= { "GenuineIntel" },
#ifdef CONFIG_X86_32
	/* Model names indexed by model number, one table per family. */
	.legacy_models = {
		{ .family = 4, .model_names =
		  {
			  [0] = "486 DX-25/33",
			  [1] = "486 DX-50",
			  [2] = "486 SX",
			  [3] = "486 DX/2",
			  [4] = "486 SL",
			  [5] = "486 SX/2",
			  [7] = "486 DX/2-WB",
			  [8] = "486 DX/4",
			  [9] = "486 DX/4-WB"
		  }
		},
		{ .family = 5, .model_names =
		  {
			  [0] = "Pentium 60/66 A-step",
			  [1] = "Pentium 60/66",
			  [2] = "Pentium 75 - 200",
			  [3] = "OverDrive PODP5V83",
			  [4] = "Pentium MMX",
			  [7] = "Mobile Pentium 75 - 200",
			  [8] = "Mobile Pentium MMX",
			  [9] = "Quark SoC X1000",
		  }
		},
		{ .family = 6, .model_names =
		  {
			  [0] = "Pentium Pro A-step",
			  [1] = "Pentium Pro",
			  [3] = "Pentium II (Klamath)",
			  [4] = "Pentium II (Deschutes)",
			  [5] = "Pentium II (Deschutes)",
			  [6] = "Mobile Pentium II",
			  [7] = "Pentium III (Katmai)",
			  [8] = "Pentium III (Coppermine)",
			  [10] = "Pentium III (Cascades)",
			  [11] = "Pentium III (Tualatin)",
		  }
		},
		{ .family = 15, .model_names =
		  {
			  [0] = "Pentium 4 (Unknown)",
			  [1] = "Pentium 4 (Willamette)",
			  [2] = "Pentium 4 (Northwood)",
			  [4] = "Pentium 4 (Foster)",
			  [5] = "Pentium 4 (Foster)",
		  }
		},
	},
	.legacy_cache_size = intel_size_cache,
#endif
	.c_detect_tlb	= intel_detect_tlb,
	.c_early_init   = early_init_intel,
	.c_bsp_init	= bsp_init_intel,
	.c_init		= init_intel,
	.c_x86_vendor	= X86_VENDOR_INTEL,
};

cpu_dev_register(intel_cpu_dev);
976 
977 #undef pr_fmt
978 #define pr_fmt(fmt) "x86/split lock detection: " fmt
979 
980 static const struct {
981 	const char			*option;
982 	enum split_lock_detect_state	state;
983 } sld_options[] __initconst = {
984 	{ "off",	sld_off   },
985 	{ "warn",	sld_warn  },
986 	{ "fatal",	sld_fatal },
987 	{ "ratelimit:", sld_ratelimit },
988 };
989 
990 static struct ratelimit_state bld_ratelimit;
991 
992 static unsigned int sysctl_sld_mitigate = 1;
993 static DEFINE_SEMAPHORE(buslock_sem, 1);
994 
995 #ifdef CONFIG_PROC_SYSCTL
996 static struct ctl_table sld_sysctls[] = {
997 	{
998 		.procname       = "split_lock_mitigate",
999 		.data           = &sysctl_sld_mitigate,
1000 		.maxlen         = sizeof(unsigned int),
1001 		.mode           = 0644,
1002 		.proc_handler	= proc_douintvec_minmax,
1003 		.extra1         = SYSCTL_ZERO,
1004 		.extra2         = SYSCTL_ONE,
1005 	},
1006 };
1007 
1008 static int __init sld_mitigate_sysctl_init(void)
1009 {
1010 	register_sysctl_init("kernel", sld_sysctls);
1011 	return 0;
1012 }
1013 
1014 late_initcall(sld_mitigate_sysctl_init);
1015 #endif
1016 
1017 static inline bool match_option(const char *arg, int arglen, const char *opt)
1018 {
1019 	int len = strlen(opt), ratelimit;
1020 
1021 	if (strncmp(arg, opt, len))
1022 		return false;
1023 
1024 	/*
1025 	 * Min ratelimit is 1 bus lock/sec.
1026 	 * Max ratelimit is 1000 bus locks/sec.
1027 	 */
1028 	if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
1029 	    ratelimit > 0 && ratelimit <= 1000) {
1030 		ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
1031 		ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
1032 		return true;
1033 	}
1034 
1035 	return len == arglen;
1036 }
1037 
1038 static bool split_lock_verify_msr(bool on)
1039 {
1040 	u64 ctrl, tmp;
1041 
1042 	if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
1043 		return false;
1044 	if (on)
1045 		ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
1046 	else
1047 		ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
1048 	if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
1049 		return false;
1050 	rdmsrl(MSR_TEST_CTRL, tmp);
1051 	return ctrl == tmp;
1052 }
1053 
1054 static void __init sld_state_setup(void)
1055 {
1056 	enum split_lock_detect_state state = sld_warn;
1057 	char arg[20];
1058 	int i, ret;
1059 
1060 	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
1061 	    !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
1062 		return;
1063 
1064 	ret = cmdline_find_option(boot_command_line, "split_lock_detect",
1065 				  arg, sizeof(arg));
1066 	if (ret >= 0) {
1067 		for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
1068 			if (match_option(arg, ret, sld_options[i].option)) {
1069 				state = sld_options[i].state;
1070 				break;
1071 			}
1072 		}
1073 	}
1074 	sld_state = state;
1075 }
1076 
1077 static void __init __split_lock_setup(void)
1078 {
1079 	if (!split_lock_verify_msr(false)) {
1080 		pr_info("MSR access failed: Disabled\n");
1081 		return;
1082 	}
1083 
1084 	rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
1085 
1086 	if (!split_lock_verify_msr(true)) {
1087 		pr_info("MSR access failed: Disabled\n");
1088 		return;
1089 	}
1090 
1091 	/* Restore the MSR to its cached value. */
1092 	wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
1093 
1094 	setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
1095 }
1096 
1097 /*
1098  * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
1099  * is not implemented as one thread could undo the setting of the other
1100  * thread immediately after dropping the lock anyway.
1101  */
1102 static void sld_update_msr(bool on)
1103 {
1104 	u64 test_ctrl_val = msr_test_ctrl_cache;
1105 
1106 	if (on)
1107 		test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
1108 
1109 	wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
1110 }
1111 
1112 static void split_lock_init(void)
1113 {
1114 	/*
1115 	 * #DB for bus lock handles ratelimit and #AC for split lock is
1116 	 * disabled.
1117 	 */
1118 	if (sld_state == sld_ratelimit) {
1119 		split_lock_verify_msr(false);
1120 		return;
1121 	}
1122 
1123 	if (cpu_model_supports_sld)
1124 		split_lock_verify_msr(sld_state != sld_off);
1125 }
1126 
1127 static void __split_lock_reenable_unlock(struct work_struct *work)
1128 {
1129 	sld_update_msr(true);
1130 	up(&buslock_sem);
1131 }
1132 
1133 static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
1134 
1135 static void __split_lock_reenable(struct work_struct *work)
1136 {
1137 	sld_update_msr(true);
1138 }
1139 static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
1140 
1141 /*
1142  * If a CPU goes offline with pending delayed work to re-enable split lock
1143  * detection then the delayed work will be executed on some other CPU. That
1144  * handles releasing the buslock_sem, but because it executes on a
1145  * different CPU probably won't re-enable split lock detection. This is a
1146  * problem on HT systems since the sibling CPU on the same core may then be
1147  * left running with split lock detection disabled.
1148  *
1149  * Unconditionally re-enable detection here.
1150  */
1151 static int splitlock_cpu_offline(unsigned int cpu)
1152 {
1153 	sld_update_msr(true);
1154 
1155 	return 0;
1156 }
1157 
1158 static void split_lock_warn(unsigned long ip)
1159 {
1160 	struct delayed_work *work;
1161 	int cpu;
1162 
1163 	if (!current->reported_split_lock)
1164 		pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
1165 				    current->comm, current->pid, ip);
1166 	current->reported_split_lock = 1;
1167 
1168 	if (sysctl_sld_mitigate) {
1169 		/*
1170 		 * misery factor #1:
1171 		 * sleep 10ms before trying to execute split lock.
1172 		 */
1173 		if (msleep_interruptible(10) > 0)
1174 			return;
1175 		/*
1176 		 * Misery factor #2:
1177 		 * only allow one buslocked disabled core at a time.
1178 		 */
1179 		if (down_interruptible(&buslock_sem) == -EINTR)
1180 			return;
1181 		work = &sl_reenable_unlock;
1182 	} else {
1183 		work = &sl_reenable;
1184 	}
1185 
1186 	cpu = get_cpu();
1187 	schedule_delayed_work_on(cpu, work, 2);
1188 
1189 	/* Disable split lock detection on this CPU to make progress */
1190 	sld_update_msr(false);
1191 	put_cpu();
1192 }
1193 
1194 bool handle_guest_split_lock(unsigned long ip)
1195 {
1196 	if (sld_state == sld_warn) {
1197 		split_lock_warn(ip);
1198 		return true;
1199 	}
1200 
1201 	pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
1202 		     current->comm, current->pid,
1203 		     sld_state == sld_fatal ? "fatal" : "bogus", ip);
1204 
1205 	current->thread.error_code = 0;
1206 	current->thread.trap_nr = X86_TRAP_AC;
1207 	force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
1208 	return false;
1209 }
1210 EXPORT_SYMBOL_GPL(handle_guest_split_lock);
1211 
1212 static void bus_lock_init(void)
1213 {
1214 	u64 val;
1215 
1216 	if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
1217 		return;
1218 
1219 	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
1220 
1221 	if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
1222 	    (sld_state == sld_warn || sld_state == sld_fatal)) ||
1223 	    sld_state == sld_off) {
1224 		/*
1225 		 * Warn and fatal are handled by #AC for split lock if #AC for
1226 		 * split lock is supported.
1227 		 */
1228 		val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
1229 	} else {
1230 		val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
1231 	}
1232 
1233 	wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
1234 }
1235 
1236 bool handle_user_split_lock(struct pt_regs *regs, long error_code)
1237 {
1238 	if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
1239 		return false;
1240 	split_lock_warn(regs->ip);
1241 	return true;
1242 }
1243 
1244 void handle_bus_lock(struct pt_regs *regs)
1245 {
1246 	switch (sld_state) {
1247 	case sld_off:
1248 		break;
1249 	case sld_ratelimit:
1250 		/* Enforce no more than bld_ratelimit bus locks/sec. */
1251 		while (!__ratelimit(&bld_ratelimit))
1252 			msleep(20);
1253 		/* Warn on the bus lock. */
1254 		fallthrough;
1255 	case sld_warn:
1256 		pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
1257 				    current->comm, current->pid, regs->ip);
1258 		break;
1259 	case sld_fatal:
1260 		force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
1261 		break;
1262 	}
1263 }
1264 
1265 /*
1266  * CPU models that are known to have the per-core split-lock detection
1267  * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
1268  */
1269 static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
1270 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,	0),
1271 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,	0),
1272 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,	0),
1273 	{}
1274 };
1275 
1276 static void __init split_lock_setup(struct cpuinfo_x86 *c)
1277 {
1278 	const struct x86_cpu_id *m;
1279 	u64 ia32_core_caps;
1280 
1281 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
1282 		return;
1283 
1284 	/* Check for CPUs that have support but do not enumerate it: */
1285 	m = x86_match_cpu(split_lock_cpu_ids);
1286 	if (m)
1287 		goto supported;
1288 
1289 	if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
1290 		return;
1291 
1292 	/*
1293 	 * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
1294 	 * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is.  All CPUs that set
1295 	 * it have split lock detection.
1296 	 */
1297 	rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
1298 	if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
1299 		goto supported;
1300 
1301 	/* CPU is not in the model list and does not have the MSR bit: */
1302 	return;
1303 
1304 supported:
1305 	cpu_model_supports_sld = true;
1306 	__split_lock_setup();
1307 }
1308 
1309 static void sld_state_show(void)
1310 {
1311 	if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
1312 	    !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
1313 		return;
1314 
1315 	switch (sld_state) {
1316 	case sld_off:
1317 		pr_info("disabled\n");
1318 		break;
1319 	case sld_warn:
1320 		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
1321 			pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
1322 			if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
1323 					      "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
1324 				pr_warn("No splitlock CPU offline handler\n");
1325 		} else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
1326 			pr_info("#DB: warning on user-space bus_locks\n");
1327 		}
1328 		break;
1329 	case sld_fatal:
1330 		if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
1331 			pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
1332 		} else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
1333 			pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
1334 				boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
1335 				" from non-WB" : "");
1336 		}
1337 		break;
1338 	case sld_ratelimit:
1339 		if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
1340 			pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
1341 		break;
1342 	}
1343 }
1344 
1345 void __init sld_setup(struct cpuinfo_x86 *c)
1346 {
1347 	split_lock_setup(c);
1348 	sld_state_setup();
1349 	sld_state_show();
1350 }
1351 
1352 #define X86_HYBRID_CPU_TYPE_ID_SHIFT	24
1353 
1354 /**
1355  * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
1356  *
1357  * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
1358  * a hybrid processor. If the processor is not hybrid, returns 0.
1359  */
1360 u8 get_this_hybrid_cpu_type(void)
1361 {
1362 	if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
1363 		return 0;
1364 
1365 	return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
1366 }
1367