1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * x86 SMP booting functions
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
6 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
7 * Copyright 2001 Andi Kleen, SuSE Labs.
8 *
9 * Much of the core SMP work is based on previous work by Thomas Radke, to
10 * whom a great many thanks are extended.
11 *
12 * Thanks to Intel for making available several different Pentium,
13 * Pentium Pro and Pentium-II/Xeon MP machines.
14 * Original development of Linux SMP code supported by Caldera.
15 *
16 * Fixes
17 * Felix Koop : NR_CPUS used properly
18 * Jose Renau : Handle single CPU case.
19 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
20 * Greg Wright : Fix for kernel stacks panic.
21 * Erich Boleyn : MP v1.4 and additional changes.
22 * Matthias Sattler : Changes for 2.1 kernel map.
23 * Michel Lespinasse : Changes for 2.1 kernel map.
24 * Michael Chastain : Change trampoline.S to gnu as.
25 * Alan Cox : Dumb bug: 'B' step PPro's are fine
26 * Ingo Molnar : Added APIC timers, based on code
27 * from Jose Renau
28 * Ingo Molnar : various cleanups and rewrites
29 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
30 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
31 * Andi Kleen : Changed for SMP boot into long mode.
32 * Martin J. Bligh : Added support for multi-quad systems
33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
35 * Andi Kleen : Converted to new state machine.
36 * Ashok Raj : CPU hotplug support
37 * Glauber Costa : i386 and x86_64 integration
38 */
39
40 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/export.h>
45 #include <linux/sched.h>
46 #include <linux/sched/topology.h>
47 #include <linux/sched/hotplug.h>
48 #include <linux/sched/task_stack.h>
49 #include <linux/percpu.h>
50 #include <linux/memblock.h>
51 #include <linux/err.h>
52 #include <linux/nmi.h>
53 #include <linux/tboot.h>
54 #include <linux/gfp.h>
55 #include <linux/cpuidle.h>
56 #include <linux/kexec.h>
57 #include <linux/numa.h>
58 #include <linux/pgtable.h>
59 #include <linux/overflow.h>
60 #include <linux/stackprotector.h>
61 #include <linux/cpuhotplug.h>
62 #include <linux/mc146818rtc.h>
63 #include <linux/acpi.h>
64
65 #include <asm/acpi.h>
66 #include <asm/cacheinfo.h>
67 #include <asm/cpuid/api.h>
68 #include <asm/desc.h>
69 #include <asm/nmi.h>
70 #include <asm/irq.h>
71 #include <asm/realmode.h>
72 #include <asm/cpu.h>
73 #include <asm/numa.h>
74 #include <asm/tlbflush.h>
75 #include <asm/mtrr.h>
76 #include <asm/mwait.h>
77 #include <asm/apic.h>
78 #include <asm/io_apic.h>
79 #include <asm/fpu/api.h>
80 #include <asm/setup.h>
81 #include <asm/uv/uv.h>
82 #include <asm/microcode.h>
83 #include <asm/i8259.h>
84 #include <asm/misc.h>
85 #include <asm/qspinlock.h>
86 #include <asm/intel-family.h>
87 #include <asm/cpu_device_id.h>
88 #include <asm/spec-ctrl.h>
89 #include <asm/hw_irq.h>
90 #include <asm/stackprotector.h>
91 #include <asm/sev.h>
93
94 /* representing HT siblings of each logical CPU */
95 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
96 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
97
98 /* representing HT and core siblings of each logical CPU */
99 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
100 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
101
102 /* representing HT, core, and die siblings of each logical CPU */
103 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
104 EXPORT_PER_CPU_SYMBOL(cpu_die_map);
105
106 /* Representing CPUs for which sibling maps can be computed */
107 static cpumask_var_t cpu_sibling_setup_mask;
108
109 struct mwait_cpu_dead {
110 unsigned int control;
111 unsigned int status;
112 };
113
114 #define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
115 #define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
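/*
 * Handshake values for the MWAIT based play_dead() protocol below:
 * mwait_play_dead() parks the CPU with control/status set to
 * CPUDEAD_MWAIT_WAIT; on kexec, smp_kick_mwait_play_dead() writes
 * CPUDEAD_MWAIT_KEXEC_HLT into the monitored control word and the woken
 * CPU acknowledges by storing the same value into status before
 * switching to HLT.
 */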
116
117 /*
118 * Cache line aligned data for mwait_play_dead(). Separate on purpose so
119 * that it's unlikely to be touched by other CPUs.
120 */
121 static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
122
123 /* Maximum number of SMT threads on any online core */
124 int __read_mostly __max_smt_threads = 1;
125
126 /* Flag to indicate if a complete sched domain rebuild is required */
127 bool x86_topology_update;
128
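/*
 * Consumed by the scheduler when rebuilding sched domains: returns
 * whether a rebuild was requested and clears the request flag.
 */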
129 int arch_update_cpu_topology(void)
130 {
131 int retval = x86_topology_update;
132
133 x86_topology_update = false;
134 return retval;
135 }
136
137 static unsigned int smpboot_warm_reset_vector_count;
138
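/*
 * Legacy warm reset: write shutdown code 0x0A into CMOS register 0x0F
 * and point the BIOS warm-reset vector in low memory at the real-mode
 * trampoline so that an AP coming out of INIT jumps to start_eip.
 * The refcount makes setup/restore idempotent for nested users.
 */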
139 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
140 {
141 unsigned long flags;
142
143 spin_lock_irqsave(&rtc_lock, flags);
144 if (!smpboot_warm_reset_vector_count++) {
145 CMOS_WRITE(0xa, 0xf);
146 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
147 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
148 }
149 spin_unlock_irqrestore(&rtc_lock, flags);
150 }
151
152 static inline void smpboot_restore_warm_reset_vector(void)
153 {
154 unsigned long flags;
155
156 /*
157 * Paranoid: Set warm reset code and vector here back
158 * to default values.
159 */
160 spin_lock_irqsave(&rtc_lock, flags);
161 if (!--smpboot_warm_reset_vector_count) {
162 CMOS_WRITE(0, 0xf);
163 *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
164 }
165 spin_unlock_irqrestore(&rtc_lock, flags);
166
167 }
168
169 /* Run the next set of setup steps for the upcoming CPU */
170 static void ap_starting(void)
171 {
172 int cpuid = smp_processor_id();
173
174 /* Mop up possible mwait_play_dead() wreckage */
175 this_cpu_write(mwait_cpu_dead.status, 0);
176 this_cpu_write(mwait_cpu_dead.control, 0);
177
178 /*
179 * If woken up by an INIT in an 82489DX configuration the alive
180 * synchronization guarantees that the CPU does not reach this
181 * point before an INIT_deassert IPI reaches the local APIC, so it
182 * is now safe to touch the local APIC.
183 *
184 * Set up this CPU, first the APIC, which is probably redundant on
185 * most boards.
186 */
187 apic_ap_setup();
188
189 /* Save the processor parameters. */
190 identify_secondary_cpu(cpuid);
191
192 /*
193 * The topology information must be up to date before
194 * notify_cpu_starting().
195 */
196 set_cpu_sibling_map(cpuid);
197
198 ap_init_aperfmperf();
199
200 pr_debug("Stack at about %p\n", &cpuid);
201
202 wmb();
203
204 /*
205 * This runs the AP through all the cpuhp states to its target
206 * state CPUHP_ONLINE.
207 */
208 notify_cpu_starting(cpuid);
209 }
210
211 static void ap_calibrate_delay(void)
212 {
213 /*
214 * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
215 * identify_secondary_cpu() stored a value that is close but not as
216 * accurate as the value just calculated.
217 *
218 * As this is invoked after the TSC synchronization check,
219 * calibrate_delay_is_known() will skip the calibration routine
220 * when TSC is synchronized across sockets.
221 */
222 calibrate_delay();
223 cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
224 }
225
226 /*
227 * Activate a secondary processor.
228 */
229 static void notrace __noendbr start_secondary(void *unused)
230 {
231 /*
232 * Don't put *anything* except direct CPU state initialization
233 * before cpu_init(); SMP booting is fragile enough that we want to
234 * limit the things done here to the most necessary things.
235 */
236 cr4_init();
237
238 /*
239 * 32-bit specific. 64-bit reaches this code with the correct page
240 * table established. Yet another historical divergence.
241 */
242 if (IS_ENABLED(CONFIG_X86_32)) {
243 /* switch away from the initial page table */
244 load_cr3(swapper_pg_dir);
245 __flush_tlb_all();
246 }
247
248 cpu_init_exception_handling(false);
249
250 /*
251 * Load the microcode before reaching the AP alive synchronization
252 * point below so it is not part of the full per CPU serialized
253 * bringup part when "parallel" bringup is enabled.
254 *
255 * That's even safe when hyperthreading is enabled in the CPU as
256 * the core code starts the primary threads first and leaves the
257 * secondary threads waiting for SIPI. Loading microcode on
258 * physical cores concurrently is a safe operation.
259 *
260 * This covers both the Intel specific issue that concurrent
261 * microcode loading on SMT siblings must be prohibited and the
262 * vendor independent issue that microcode loading which changes
263 * CPUID, MSRs etc. must be strictly serialized to maintain
264 * software state correctness.
265 */
266 load_ucode_ap();
267
268 /*
269 * Synchronization point with the hotplug core. Sets this CPU's
270 * synchronization state to ALIVE and spin-waits for the control CPU to
271 * release this CPU for further bringup.
272 */
273 cpuhp_ap_sync_alive();
274
275 cpu_init();
276 fpu__init_cpu();
277 rcutree_report_cpu_starting(raw_smp_processor_id());
278 x86_cpuinit.early_percpu_clock_init();
279
280 ap_starting();
281
282 /* Check TSC synchronization with the control CPU. */
283 check_tsc_sync_target();
284
285 /*
286 * Calibrate the delay loop after the TSC synchronization check.
287 * This allows skipping the calibration when TSC is synchronized
288 * across sockets.
289 */
290 ap_calibrate_delay();
291
292 speculative_store_bypass_ht_init();
293
294 /*
295 * Lock vector_lock, set CPU online and bring the vector
296 * allocator online. Online must be set with vector_lock held
297 * to prevent a concurrent irq setup/teardown from seeing a
298 * half valid vector space.
299 */
300 lock_vector_lock();
301 set_cpu_online(smp_processor_id(), true);
302 lapic_online();
303 unlock_vector_lock();
304 x86_platform.nmi_init();
305
306 /* enable local interrupts */
307 local_irq_enable();
308
309 x86_cpuinit.setup_percpu_clockev();
310
311 wmb();
312 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
313 }
314 ANNOTATE_NOENDBR_SYM(start_secondary);
315
316 static bool
317 topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
318 {
319 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
320
321 return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
322 }
323
324 static bool
325 topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
326 {
327 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
328
329 return !WARN_ONCE(!topology_same_node(c, o),
330 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
331 "[node: %d != %d]. Ignoring dependency.\n",
332 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
333 }
334
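/* Link two CPUs symmetrically in the cpumask returned by mfunc(). */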
335 #define link_mask(mfunc, c1, c2) \
336 do { \
337 cpumask_set_cpu((c1), mfunc(c2)); \
338 cpumask_set_cpu((c2), mfunc(c1)); \
339 } while (0)
340
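/*
 * SMT sibling test: with AMD TOPOEXT, the CPUs must share package, die,
 * node and LLC and have either the same core id or the same compute
 * unit (cu_id); otherwise matching package, die and core id is
 * sufficient.
 */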
341 static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
342 {
343 if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
344 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
345
346 if (c->topo.pkg_id == o->topo.pkg_id &&
347 c->topo.die_id == o->topo.die_id &&
348 c->topo.amd_node_id == o->topo.amd_node_id &&
349 per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
350 if (c->topo.core_id == o->topo.core_id)
351 return topology_sane(c, o, "smt");
352
353 if ((c->topo.cu_id != 0xff) &&
354 (o->topo.cu_id != 0xff) &&
355 (c->topo.cu_id == o->topo.cu_id))
356 return topology_sane(c, o, "smt");
357 }
358
359 } else if (c->topo.pkg_id == o->topo.pkg_id &&
360 c->topo.die_id == o->topo.die_id &&
361 c->topo.core_id == o->topo.core_id) {
362 return topology_sane(c, o, "smt");
363 }
364
365 return false;
366 }
367
368 static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
369 {
370 if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id)
371 return false;
372
373 if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1)
374 return c->topo.amd_node_id == o->topo.amd_node_id;
375
376 return true;
377 }
378
379 static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
380 {
381 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
382
383 /* If the arch didn't set up l2c_id, fall back to SMT */
384 if (per_cpu_l2c_id(cpu1) == BAD_APICID)
385 return match_smt(c, o);
386
387 /* Do not match if L2 cache id does not match: */
388 if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
389 return false;
390
391 return topology_sane(c, o, "l2c");
392 }
393
394 /*
395 * Unlike the other levels, we do not enforce keeping a
396 * multicore group inside a NUMA node. If this happens, we will
397 * discard the MC level of the topology later.
398 */
399 static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
400 {
401 if (c->topo.pkg_id == o->topo.pkg_id)
402 return true;
403 return false;
404 }
405
406 /*
407 * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
408 *
409 * Any Intel CPU that has multiple nodes per package and does not
410 * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
411 *
412 * When in SNC mode, these CPUs enumerate an LLC that is shared
413 * by multiple NUMA nodes. The LLC is shared for off-package data
414 * access but private to the NUMA node (half of the package) for
415 * on-package access. CPUID (the source of the information about
416 * the LLC) can only enumerate the cache as shared or unshared,
417 * but not this particular configuration.
418 */
419
420 static const struct x86_cpu_id intel_cod_cpu[] = {
421 X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
422 X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
423 X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
424 {}
425 };
426
427 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
428 {
429 const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
430 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
431 bool intel_snc = id && id->driver_data;
432
433 /* Do not match if we do not have a valid APICID for cpu: */
434 if (per_cpu_llc_id(cpu1) == BAD_APICID)
435 return false;
436
437 /* Do not match if LLC id does not match: */
438 if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
439 return false;
440
441 /*
442 * Allow the SNC topology without warning. Return of false
443 * means 'c' does not share the LLC of 'o'. This will be
444 * reflected to userspace.
445 */
446 if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
447 return false;
448
449 return topology_sane(c, o, "llc");
450 }
451
452
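/*
 * Add SD_ASYM_PACKING to the scheduler domain flags when ITMT priority
 * scheduling has been enabled via the sysctl.
 */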
453 static inline int x86_sched_itmt_flags(void)
454 {
455 return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
456 }
457
458 #ifdef CONFIG_SCHED_MC
459 static int x86_core_flags(void)
460 {
461 return cpu_core_flags() | x86_sched_itmt_flags();
462 }
463 #endif
464 #ifdef CONFIG_SCHED_CLUSTER
465 static int x86_cluster_flags(void)
466 {
467 return cpu_cluster_flags() | x86_sched_itmt_flags();
468 }
469 #endif
470
471 /*
472 * Set if a package/die has multiple NUMA nodes inside.
473 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
474 * Sub-NUMA Clustering have this.
475 */
476 static bool x86_has_numa_in_package;
477
478 static struct sched_domain_topology_level x86_topology[] = {
479 SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
480 #ifdef CONFIG_SCHED_CLUSTER
481 SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
482 #endif
483 #ifdef CONFIG_SCHED_MC
484 SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
485 #endif
486 SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
487 { NULL },
488 };
489
490 static void __init build_sched_topology(void)
491 {
492 struct sched_domain_topology_level *topology = x86_topology;
493
494 /*
495 * When there is NUMA topology inside the package invalidate the
496 * PKG domain since the NUMA domains will auto-magically create the
497 * right spanning domains based on the SLIT.
498 */
499 if (x86_has_numa_in_package) {
500 unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
501
502 memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
503 }
504
505 /*
506 * Drop the SMT domains if there is only one thread per core
507 * since they will get degenerated by the scheduler anyway.
508 */
509 if (cpu_smt_num_threads <= 1)
510 ++topology;
511
512 set_sched_topology(topology);
513 }
514
515 #ifdef CONFIG_NUMA
516 static int sched_avg_remote_distance;
517 static int avg_remote_numa_distance(void)
518 {
519 int i, j;
520 int distance, nr_remote, total_distance;
521
522 if (sched_avg_remote_distance > 0)
523 return sched_avg_remote_distance;
524
525 nr_remote = 0;
526 total_distance = 0;
527 for_each_node_state(i, N_CPU) {
528 for_each_node_state(j, N_CPU) {
529 distance = node_distance(i, j);
530
531 if (distance >= REMOTE_DISTANCE) {
532 nr_remote++;
533 total_distance += distance;
534 }
535 }
536 }
537 if (nr_remote)
538 sched_avg_remote_distance = total_distance / nr_remote;
539 else
540 sched_avg_remote_distance = REMOTE_DISTANCE;
541
542 return sched_avg_remote_distance;
543 }
544
545 int arch_sched_node_distance(int from, int to)
546 {
547 int d = node_distance(from, to);
548
549 switch (boot_cpu_data.x86_vfm) {
550 case INTEL_GRANITERAPIDS_X:
551 case INTEL_ATOM_DARKMONT_X:
552
553 if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
554 d < REMOTE_DISTANCE)
555 return d;
556
557 /*
558 * With SNC enabled, there could be too many levels of remote
559 * NUMA node distances, creating NUMA domain levels
560 * including local nodes and partial remote nodes.
561 *
562 * Trim finer distance tuning for NUMA nodes in remote package
563 * for the purpose of building sched domains. Group NUMA nodes
564 * in the remote package in the same sched group.
565 * Simplify NUMA domains and avoid extra NUMA levels including
566 * different remote NUMA nodes and local nodes.
567 *
568 * GNR and CWF don't expect systems with more than 2 packages
569 * and more than 2 hops between packages. Single average remote
570 * distance won't be appropriate if there are more than 2
571 * packages as average distance to different remote packages
572 * could be different.
573 */
574 WARN_ONCE(topology_max_packages() > 2,
575 "sched: Expect only up to 2 packages for GNR or CWF, "
576 "but saw %d packages when building sched domains.",
577 topology_max_packages());
578
579 d = avg_remote_numa_distance();
580 }
581 return d;
582 }
583 #endif /* CONFIG_NUMA */
584
585 void set_cpu_sibling_map(int cpu)
586 {
587 bool has_smt = __max_threads_per_core > 1;
588 bool has_mp = has_smt || topology_num_cores_per_package() > 1;
589 struct cpuinfo_x86 *c = &cpu_data(cpu);
590 struct cpuinfo_x86 *o;
591 int i, threads;
592
593 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
594
595 if (!has_mp) {
596 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
597 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
598 cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
599 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
600 cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
601 c->booted_cores = 1;
602 return;
603 }
604
605 for_each_cpu(i, cpu_sibling_setup_mask) {
606 o = &cpu_data(i);
607
608 if (match_pkg(c, o) && !topology_same_node(c, o))
609 x86_has_numa_in_package = true;
610
611 if ((i == cpu) || (has_smt && match_smt(c, o)))
612 link_mask(topology_sibling_cpumask, cpu, i);
613
614 if ((i == cpu) || (has_mp && match_llc(c, o)))
615 link_mask(cpu_llc_shared_mask, cpu, i);
616
617 if ((i == cpu) || (has_mp && match_l2c(c, o)))
618 link_mask(cpu_l2c_shared_mask, cpu, i);
619
620 if ((i == cpu) || (has_mp && match_die(c, o)))
621 link_mask(topology_die_cpumask, cpu, i);
622 }
623
624 threads = cpumask_weight(topology_sibling_cpumask(cpu));
625 if (threads > __max_smt_threads)
626 __max_smt_threads = threads;
627
628 for_each_cpu(i, topology_sibling_cpumask(cpu))
629 cpu_data(i).smt_active = threads > 1;
630
631 /*
632 * This needs a separate iteration over the cpus because we rely on all
633 * topology_sibling_cpumask links to be set-up.
634 */
635 for_each_cpu(i, cpu_sibling_setup_mask) {
636 o = &cpu_data(i);
637
638 if ((i == cpu) || (has_mp && match_pkg(c, o))) {
639 link_mask(topology_core_cpumask, cpu, i);
640
641 /*
642 * Does this new CPU bring up a new core?
643 */
644 if (threads == 1) {
645 /*
646 * for each core in package, increment
647 * the booted_cores for this new cpu
648 */
649 if (cpumask_first(
650 topology_sibling_cpumask(i)) == i)
651 c->booted_cores++;
652 /*
653 * increment the core count for all
654 * the other cpus in this package
655 */
656 if (i != cpu)
657 cpu_data(i).booted_cores++;
658 } else if (i != cpu && !c->booted_cores)
659 c->booted_cores = cpu_data(i).booted_cores;
660 }
661 }
662 }
663
664 /* maps the cpu to the sched domain representing multi-core */
665 const struct cpumask *cpu_coregroup_mask(int cpu)
666 {
667 return cpu_llc_shared_mask(cpu);
668 }
669
670 const struct cpumask *cpu_clustergroup_mask(int cpu)
671 {
672 return cpu_l2c_shared_mask(cpu);
673 }
674 EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
675
676 static void impress_friends(void)
677 {
678 int cpu;
679 unsigned long bogosum = 0;
680 /*
681 * Allow the user to impress friends.
682 */
683 pr_debug("Before bogomips\n");
684 for_each_online_cpu(cpu)
685 bogosum += cpu_data(cpu).loops_per_jiffy;
686
687 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
688 num_online_cpus(),
689 bogosum/(500000/HZ),
690 (bogosum/(5000/HZ))%100);
691
692 pr_debug("Before bogocount - setting activated=1\n");
693 }
694
695 /*
696 * The Multiprocessor Specification 1.4 (1997) example code suggests
697 * that there should be a 10ms delay between the BSP asserting INIT
698 * and de-asserting INIT, when starting a remote processor.
699 * But that slows boot and resume on modern processors, which include
700 * many cores and don't require that delay.
701 *
702 * Cmdline "cpu_init_udelay=" is available to override this delay.
703 */
704 #define UDELAY_10MS_LEGACY 10000
705
706 static unsigned int init_udelay = UINT_MAX;
707
708 static int __init cpu_init_udelay(char *str)
709 {
710 get_option(&str, &init_udelay);
711
712 return 0;
713 }
714 early_param("cpu_init_udelay", cpu_init_udelay);
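/*
 * Example: booting with "cpu_init_udelay=10000" restores the legacy
 * 10ms delay, while "cpu_init_udelay=0" removes the delay entirely.
 */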
715
716 static void __init smp_set_init_udelay(void)
717 {
718 /* if cmdline changed it from default, leave it alone */
719 if (init_udelay != UINT_MAX)
720 return;
721
722 /* if modern processor, use no delay */
723 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
724 (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
725 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
726 init_udelay = 0;
727 return;
728 }
729 /* else, use legacy delay */
730 init_udelay = UDELAY_10MS_LEGACY;
731 }
732
733 /*
734 * Wake up AP by INIT, INIT, STARTUP sequence.
735 */
736 static void send_init_sequence(u32 phys_apicid)
737 {
738 int maxlvt = lapic_get_maxlvt();
739
740 /* Be paranoid about clearing APIC errors. */
741 if (APIC_INTEGRATED(boot_cpu_apic_version)) {
742 /* Due to the Pentium erratum 3AP. */
743 if (maxlvt > 3)
744 apic_write(APIC_ESR, 0);
745 apic_read(APIC_ESR);
746 }
747
748 /* Assert INIT on the target CPU */
749 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
750 safe_apic_wait_icr_idle();
751
752 udelay(init_udelay);
753
754 /* Deassert INIT on the target CPU */
755 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
756 safe_apic_wait_icr_idle();
757 }
758
759 /*
760 * Wake up AP by INIT, INIT, STARTUP sequence.
761 */
762 static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu)
763 {
764 unsigned long send_status = 0, accept_status = 0;
765 int num_starts, j, maxlvt;
766
767 preempt_disable();
768 maxlvt = lapic_get_maxlvt();
769 send_init_sequence(phys_apicid);
770
771 mb();
772
773 /*
774 * Should we send STARTUP IPIs ?
775 *
776 * Determine this based on the APIC version.
777 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
778 */
779 if (APIC_INTEGRATED(boot_cpu_apic_version))
780 num_starts = 2;
781 else
782 num_starts = 0;
783
784 /*
785 * Run STARTUP IPI loop.
786 */
787 pr_debug("#startup loops: %d\n", num_starts);
788
789 for (j = 1; j <= num_starts; j++) {
790 pr_debug("Sending STARTUP #%d\n", j);
791 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
792 apic_write(APIC_ESR, 0);
793 apic_read(APIC_ESR);
794 pr_debug("After apic_write\n");
795
796 /*
797 * STARTUP IPI
798 */
799
800 /* Target chip */
801 /* Boot on the stack */
802 /* Kick the second */
803 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
804 phys_apicid);
805
806 /*
807 * Give the other CPU some time to accept the IPI.
808 */
809 if (init_udelay == 0)
810 udelay(10);
811 else
812 udelay(300);
813
814 pr_debug("Startup point 1\n");
815
816 pr_debug("Waiting for send to finish...\n");
817 send_status = safe_apic_wait_icr_idle();
818
819 /*
820 * Give the other CPU some time to accept the IPI.
821 */
822 if (init_udelay == 0)
823 udelay(10);
824 else
825 udelay(200);
826
827 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
828 apic_write(APIC_ESR, 0);
829 accept_status = (apic_read(APIC_ESR) & 0xEF);
830 if (send_status || accept_status)
831 break;
832 }
833 pr_debug("After Startup\n");
834
835 if (send_status)
836 pr_err("APIC never delivered???\n");
837 if (accept_status)
838 pr_err("APIC delivery error (%lx)\n", accept_status);
839
840 preempt_enable();
841 return (send_status | accept_status);
842 }
843
844 /* reduce the number of lines printed when booting a large cpu count system */
845 static void announce_cpu(int cpu, int apicid)
846 {
847 static int width, node_width, first = 1;
848 static int current_node = NUMA_NO_NODE;
849 int node = early_cpu_to_node(cpu);
850
851 if (!width)
852 width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
853
854 if (!node_width)
855 node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
856
857 if (system_state < SYSTEM_RUNNING) {
858 if (first)
859 pr_info("x86: Booting SMP configuration:\n");
860
861 if (node != current_node) {
862 if (current_node > (-1))
863 pr_cont("\n");
864 current_node = node;
865
866 printk(KERN_INFO ".... node %*s#%d, CPUs: ",
867 node_width - num_digits(node), " ", node);
868 }
869
870 /* Add padding for the BSP */
871 if (first)
872 pr_cont("%*s", width + 1, " ");
873 first = 0;
874
875 pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
876 } else
877 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
878 node, cpu, apicid);
879 }
880
881 int common_cpu_up(unsigned int cpu, struct task_struct *idle)
882 {
883 int ret;
884
885 /* Just in case we booted with a single CPU. */
886 alternatives_enable_smp();
887
888 per_cpu(current_task, cpu) = idle;
889 cpu_init_stack_canary(cpu, idle);
890
891 /* Initialize the interrupt stack(s) */
892 ret = irq_init_percpu_irqstack(cpu);
893 if (ret)
894 return ret;
895
896 #ifdef CONFIG_X86_32
897 /* Stack for startup_32 can be just as for start_secondary onwards */
898 per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
899 #endif
900 return 0;
901 }
902
903 /*
904 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
905 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
906 * Returns zero if startup was successfully sent, else error code from
907 * ->wakeup_secondary_cpu.
908 */
909 static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
910 {
911 unsigned long start_ip = real_mode_header->trampoline_start;
912 int ret;
913
914 #ifdef CONFIG_X86_64
915 /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
916 if (apic->wakeup_secondary_cpu_64)
917 start_ip = real_mode_header->trampoline_start64;
918 #endif
919 idle->thread.sp = (unsigned long)task_pt_regs(idle);
920 initial_code = (unsigned long)start_secondary;
921
922 if (IS_ENABLED(CONFIG_X86_32)) {
923 early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
924 initial_stack = idle->thread.sp;
925 } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
926 smpboot_control = cpu;
927 }
928
929 /* Enable the espfix hack for this CPU */
930 init_espfix_ap(cpu);
931
932 /* So we see what's up */
933 announce_cpu(cpu, apicid);
934
935 /*
936 * This grunge runs the startup process for
937 * the targeted processor.
938 */
939 if (x86_platform.legacy.warm_reset) {
940
941 pr_debug("Setting warm reset code and vector.\n");
942
943 smpboot_setup_warm_reset_vector(start_ip);
944 /*
945 * Be paranoid about clearing APIC errors.
946 */
947 if (APIC_INTEGRATED(boot_cpu_apic_version)) {
948 apic_write(APIC_ESR, 0);
949 apic_read(APIC_ESR);
950 }
951 }
952
953 smp_mb();
954
955 /*
956 * Wake up a CPU in one of the following ways:
957 * - Use a method from the APIC driver if one is defined, with wakeup
958 * straight to 64-bit mode preferred over wakeup to real mode.
959 * Otherwise,
960 * - Use an INIT boot APIC message
961 */
962 if (apic->wakeup_secondary_cpu_64)
963 ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
964 else if (apic->wakeup_secondary_cpu)
965 ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
966 else
967 ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
968
969 /* If the wakeup mechanism failed, cleanup the warm reset vector */
970 if (ret)
971 arch_cpuhp_cleanup_kick_cpu(cpu);
972 return ret;
973 }
974
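/*
 * Kick a single AP: validate its APIC ID against the present map, sync
 * MTRR state, set up the idle task and stacks via common_cpu_up() and
 * finally wake the AP via do_boot_cpu().
 */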
975 int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
976 {
977 u32 apicid = apic->cpu_present_to_apicid(cpu);
978 int err;
979
980 lockdep_assert_irqs_enabled();
981
982 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
983
984 if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
985 pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
986 return -EINVAL;
987 }
988
989 if (!test_bit(apicid, phys_cpu_present_map)) {
990 pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
991 return -EINVAL;
992 }
993
994 /*
995 * Save current MTRR state in case it was changed since early boot
996 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
997 */
998 mtrr_save_state();
999
1000 /* the FPU context is blank, nobody can own it */
1001 per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
1002
1003 err = common_cpu_up(cpu, tidle);
1004 if (err)
1005 return err;
1006
1007 err = do_boot_cpu(apicid, cpu, tidle);
1008 if (err)
1009 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
1010
1011 return err;
1012 }
1013
1014 int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
1015 {
1016 return smp_ops.kick_ap_alive(cpu, tidle);
1017 }
1018
1019 void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
1020 {
1021 /* Cleanup possible dangling ends... */
1022 if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
1023 smpboot_restore_warm_reset_vector();
1024 }
1025
1026 void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
1027 {
1028 if (smp_ops.cleanup_dead_cpu)
1029 smp_ops.cleanup_dead_cpu(cpu);
1030
1031 if (system_state == SYSTEM_RUNNING)
1032 pr_info("CPU %u is now offline\n", cpu);
1033 }
1034
1035 void arch_cpuhp_sync_state_poll(void)
1036 {
1037 if (smp_ops.poll_sync_state)
1038 smp_ops.poll_sync_state();
1039 }
1040
1041 /**
1042 * arch_disable_smp_support() - Disables SMP support for x86 at boottime
1043 */
1044 void __init arch_disable_smp_support(void)
1045 {
1046 disable_ioapic_support();
1047 }
1048
1049 /*
1050 * Fall back to non-SMP mode after errors.
1051 *
1052 * RED-PEN audit/test this more. I bet there is more state messed up here.
1053 */
1054 static __init void disable_smp(void)
1055 {
1056 pr_info("SMP disabled\n");
1057
1058 disable_ioapic_support();
1059 topology_reset_possible_cpus_up();
1060
1061 cpumask_set_cpu(0, topology_sibling_cpumask(0));
1062 cpumask_set_cpu(0, topology_core_cpumask(0));
1063 cpumask_set_cpu(0, topology_die_cpumask(0));
1064 }
1065
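/*
 * Allocate the per-CPU sibling/core/die and cache sharing masks on the
 * home node of each possible CPU and initialize the boot CPU's sibling
 * map.
 */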
1066 void __init smp_prepare_cpus_common(void)
1067 {
1068 unsigned int cpu, node;
1069
1070 /* Mark the cpu_data of all non-boot CPUs as not yet initialized */
1071 for_each_possible_cpu(cpu) {
1072 if (cpu)
1073 per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
1074 }
1075
1076 for_each_possible_cpu(cpu) {
1077 node = cpu_to_node(cpu);
1078
1079 zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
1080 zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
1081 zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
1082 zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
1083 zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
1084 }
1085
1086 set_cpu_sibling_map(0);
1087 }
1088
1089 void __init smp_prepare_boot_cpu(void)
1090 {
1091 smp_ops.smp_prepare_boot_cpu();
1092 }
1093
1094 #ifdef CONFIG_X86_64
1095 /* Establish whether parallel bringup can be supported. */
1096 bool __init arch_cpuhp_init_parallel_bringup(void)
1097 {
1098 if (!x86_cpuinit.parallel_bringup) {
1099 pr_info("Parallel CPU startup disabled by the platform\n");
1100 return false;
1101 }
1102
1103 smpboot_control = STARTUP_READ_APICID;
1104 pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
1105 return true;
1106 }
1107 #endif
1108
1109 /*
1110 * Prepare for SMP bootup.
1111 * @max_cpus: configured maximum number of CPUs. It is a legacy parameter
1112 * for common interface support.
1113 */
1114 void __init native_smp_prepare_cpus(unsigned int max_cpus)
1115 {
1116 smp_prepare_cpus_common();
1117
1118 switch (apic_intr_mode) {
1119 case APIC_PIC:
1120 case APIC_VIRTUAL_WIRE_NO_CONFIG:
1121 disable_smp();
1122 return;
1123 case APIC_SYMMETRIC_IO_NO_ROUTING:
1124 disable_smp();
1125 /* Setup local timer */
1126 x86_init.timers.setup_percpu_clockev();
1127 return;
1128 case APIC_VIRTUAL_WIRE:
1129 case APIC_SYMMETRIC_IO:
1130 break;
1131 }
1132
1133 /* Setup local timer */
1134 x86_init.timers.setup_percpu_clockev();
1135
1136 pr_info("CPU0: ");
1137 print_cpu_info(&cpu_data(0));
1138
1139 uv_system_init();
1140
1141 smp_set_init_udelay();
1142
1143 speculative_store_bypass_ht_init();
1144
1145 snp_set_wakeup_secondary_cpu();
1146 }
1147
1148 void arch_thaw_secondary_cpus_begin(void)
1149 {
1150 set_cache_aps_delayed_init(true);
1151 }
1152
1153 void arch_thaw_secondary_cpus_end(void)
1154 {
1155 cache_aps_init();
1156 }
1157
1158 /*
1159 * Early setup to make printk work.
1160 */
1161 void __init native_smp_prepare_boot_cpu(void)
1162 {
1163 int me = smp_processor_id();
1164
1165 /* SMP handles this from setup_per_cpu_areas() */
1166 if (!IS_ENABLED(CONFIG_SMP))
1167 switch_gdt_and_percpu_base(me);
1168
1169 native_pv_lock_init();
1170 }
1171
1172 void __init native_smp_cpus_done(unsigned int max_cpus)
1173 {
1174 pr_debug("Boot done\n");
1175
1176 build_sched_topology();
1177 nmi_selftest();
1178 impress_friends();
1179 cache_aps_init();
1180 }
1181
1182 /* correctly size the local cpu masks */
1183 void __init setup_cpu_local_masks(void)
1184 {
1185 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
1186 }
1187
1188 #ifdef CONFIG_HOTPLUG_CPU
1189
1190 /* Recompute SMT state for all CPUs on offline */
1191 static void recompute_smt_state(void)
1192 {
1193 int max_threads, cpu;
1194
1195 max_threads = 0;
1196 for_each_online_cpu (cpu) {
1197 int threads = cpumask_weight(topology_sibling_cpumask(cpu));
1198
1199 if (threads > max_threads)
1200 max_threads = threads;
1201 }
1202 __max_smt_threads = max_threads;
1203 }
1204
1205 static void remove_siblinginfo(int cpu)
1206 {
1207 int sibling;
1208 struct cpuinfo_x86 *c = &cpu_data(cpu);
1209
1210 for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1211 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
1212 /*
1213 * last thread sibling in this cpu core going down
1214 */
1215 if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
1216 cpu_data(sibling).booted_cores--;
1217 }
1218
1219 for_each_cpu(sibling, topology_die_cpumask(cpu))
1220 cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
1221
1222 for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
1223 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1224 if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
1225 cpu_data(sibling).smt_active = false;
1226 }
1227
1228 for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1229 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1230 for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
1231 cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
1232 cpumask_clear(cpu_llc_shared_mask(cpu));
1233 cpumask_clear(cpu_l2c_shared_mask(cpu));
1234 cpumask_clear(topology_sibling_cpumask(cpu));
1235 cpumask_clear(topology_core_cpumask(cpu));
1236 cpumask_clear(topology_die_cpumask(cpu));
1237 c->topo.core_id = 0;
1238 c->booted_cores = 0;
1239 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1240 recompute_smt_state();
1241 }
1242
1243 static void remove_cpu_from_maps(int cpu)
1244 {
1245 set_cpu_online(cpu, false);
1246 numa_remove_cpu(cpu);
1247 }
1248
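/*
 * Common teardown for unplugging the current CPU: clear its sibling
 * info, disallow kernel-mode FPU, remove it from the online and NUMA
 * maps under vector_lock, migrate its interrupts and take the local
 * APIC offline.
 */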
1249 void cpu_disable_common(void)
1250 {
1251 int cpu = smp_processor_id();
1252
1253 remove_siblinginfo(cpu);
1254
1255 /*
1256 * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
1257 * brought online again, the initial state is not allowed:
1258 */
1259 this_cpu_write(kernel_fpu_allowed, false);
1260
1261 /* It's now safe to remove this processor from the online map */
1262 lock_vector_lock();
1263 remove_cpu_from_maps(cpu);
1264 unlock_vector_lock();
1265 fixup_irqs();
1266 lapic_offline();
1267 }
1268
1269 int native_cpu_disable(void)
1270 {
1271 int ret;
1272
1273 ret = lapic_can_unplug_cpu();
1274 if (ret)
1275 return ret;
1276
1277 cpu_disable_common();
1278
1279 /*
1280 * Disable the local APIC. Otherwise IPI broadcasts will reach
1281 * it. It still responds normally to INIT, NMI, SMI, and SIPI
1282 * messages.
1283 *
1284 * Disabling the APIC must happen after cpu_disable_common()
1285 * which invokes fixup_irqs().
1286 *
1287 * Disabling the APIC preserves already set bits in IRR, but
1288 * an interrupt arriving after disabling the local APIC does not
1289 * set the corresponding IRR bit.
1290 *
1291 * fixup_irqs() scans IRR for set bits so it can raise a not
1292 * yet handled interrupt on the new destination CPU via an IPI
1293 * but obviously it can't do so for IRR bits which are not set.
1294 * IOW, interrupts arriving after disabling the local APIC will
1295 * be lost.
1296 */
1297 apic_soft_disable();
1298
1299 return 0;
1300 }
1301
1302 void play_dead_common(void)
1303 {
1304 idle_task_exit();
1305
1306 cpuhp_ap_report_dead();
1307
1308 local_irq_disable();
1309 }
1310
1311 /*
1312 * We need to flush the caches before going to sleep, lest we have
1313 * dirty data in our caches when we come back up.
1314 */
1315 void __noreturn mwait_play_dead(unsigned int eax_hint)
1316 {
1317 struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
1318
1319 /* Set up state for the kexec() hack below */
1320 md->status = CPUDEAD_MWAIT_WAIT;
1321 md->control = CPUDEAD_MWAIT_WAIT;
1322
1323 wbinvd();
1324
1325 while (1) {
1326 /*
1327 * The CLFLUSH is a workaround for erratum AAI65 for
1328 * the Xeon 7400 series. It's not clear it is actually
1329 * needed, but it should be harmless in either case.
1330 * The WBINVD is insufficient due to the spurious-wakeup
1331 * case where we return around the loop.
1332 */
1333 mb();
1334 clflush(md);
1335 mb();
1336 __monitor(md, 0, 0);
1337 mb();
1338 __mwait(eax_hint, 0);
1339
1340 if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
1341 /*
1342 * Kexec is about to happen. Don't go back into mwait() as
1343 * the kexec kernel might overwrite text and data including
1344 * page tables and stack. So mwait() would resume when the
1345 * monitor cache line is written to and then the CPU goes
1346 * south due to overwritten text, page tables and stack.
1347 *
1348 * Note: This does _NOT_ protect against a stray MCE, NMI,
1349 * SMI. They will resume execution at the instruction
1350 * following the HLT instruction and run into the problem
1351 * which this is trying to prevent.
1352 */
1353 WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
1354 while (1)
1355 native_halt();
1356 }
1357 }
1358 }
1359
1360 /*
1361 * Kick all "offline" CPUs out of mwait on kexec(). See comment in
1362 * mwait_play_dead().
1363 */
1364 void smp_kick_mwait_play_dead(void)
1365 {
1366 u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
1367 struct mwait_cpu_dead *md;
1368 unsigned int cpu, i;
1369
1370 for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
1371 md = per_cpu_ptr(&mwait_cpu_dead, cpu);
1372
1373 /* Does it sit in mwait_play_dead() ? */
1374 if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
1375 continue;
1376
1377 /* Wait up to 5ms */
1378 for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
1379 /* Bring it out of mwait */
1380 WRITE_ONCE(md->control, newstate);
1381 udelay(5);
1382 }
1383
1384 if (READ_ONCE(md->status) != newstate)
1385 pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
1386 }
1387 }
1388
1389 void __noreturn hlt_play_dead(void)
1390 {
1391 if (__this_cpu_read(cpu_info.x86) >= 4)
1392 wbinvd();
1393
1394 while (1)
1395 native_halt();
1396 }
1397
1398 void __noreturn native_play_dead(void)
1399 {
1400 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
1401 __update_spec_ctrl(0);
1402
1403 play_dead_common();
1404 tboot_shutdown(TB_SHUTDOWN_WFS);
1405
1406 /* Below returns only on error. */
1407 cpuidle_play_dead();
1408 hlt_play_dead();
1409 }
1410
1411 #else /* ... !CONFIG_HOTPLUG_CPU */
1412 int native_cpu_disable(void)
1413 {
1414 return -ENOSYS;
1415 }
1416
1417 void __noreturn native_play_dead(void)
1418 {
1419 BUG();
1420 }
1421
1422 #endif
1423