1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 * derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 #include "opt_acpi.h"
29 #ifdef __i386__
30 #include "opt_apic.h"
31 #endif
32 #include "opt_cpu.h"
33 #include "opt_ddb.h"
34 #include "opt_gdb.h"
35 #include "opt_kstack_pages.h"
36 #include "opt_pmap.h"
37 #include "opt_sched.h"
38 #include "opt_smp.h"
39 #include "opt_stack.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/asan.h>
44 #include <sys/bus.h>
45 #include <sys/cons.h> /* cngetc() */
46 #include <sys/cpuset.h>
47 #include <sys/csan.h>
48 #include <sys/interrupt.h>
49 #include <sys/kdb.h>
50 #include <sys/kernel.h>
51 #include <sys/ktr.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/memrange.h>
55 #include <sys/mutex.h>
56 #include <sys/pcpu.h>
57 #include <sys/proc.h>
58 #include <sys/sched.h>
59 #include <sys/smp.h>
60 #include <sys/sysctl.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/pmap.h>
65 #include <vm/vm_kern.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68
69 #include <x86/apicreg.h>
70 #include <machine/clock.h>
71 #include <machine/cpu.h>
72 #include <machine/cputypes.h>
73 #include <x86/mca.h>
74 #include <machine/md_var.h>
75 #include <machine/pcb.h>
76 #include <machine/psl.h>
77 #include <machine/smp.h>
78 #include <machine/specialreg.h>
79 #include <machine/stack.h>
80 #include <x86/ucode.h>
81
82 #ifdef DEV_ACPI
83 #include <contrib/dev/acpica/include/acpi.h>
84 #include <dev/acpica/acpivar.h>
85 #endif
86
/* malloc(9) type used by this file's dynamic allocations (see cpu_alloc()). */
static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");

int mp_naps; /* # of Application Processors (APs) */
int boot_cpu_id = -1; /* designated BSP's APIC ID; -1 until cpu_add() sets it */

/* AP uses this during bootstrap. Do not staticize. */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

/* NOTE(review): presumably per-CPU PCBs used for suspend/resume; set up outside this view -- confirm. */
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;
116
/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/*
 * used to hold the AP's until we are ready to release them
 * (init_secondary_tail() acquires it with a cmpset spin loop while it
 * bumps smp_cpus and prints its launch message).
 */
static int ap_boot_lock;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 *
 * cpu_info and apic_cpuids are indexed by APIC ID and are allocated by
 * cpu_alloc() with max_apic_id + 1 entries.  apic_cpuids maps an APIC ID
 * to its logical CPU ID and cpu_apic_ids maps a logical CPU ID back to
 * its APIC ID; both are filled in by assign_cpu_ids().
 */
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger that MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID");

static void release_aps(void *dummy);
static void cpustop_handler_post(u_int cpu);

/* Boot-time tunable: when zero, assign_cpu_ids() disables HTT siblings. */
static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
    &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static int hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
    &hyperthreading_intr_allowed, 0,
    "Allow interrupts on HTT logical CPUs");

/* -1 means "unlimited"; topo_probe_amd() replaces -1 with xAPIC_MAX_APIC_ID. */
static int intr_apic_id_limit = -1;
SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN,
    &intr_apic_id_limit, 0,
    "Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)");

/* Root of the hardware topology tree built by topo_probe(). */
static struct topo_node topo_root;

/*
 * Bit positions within an APIC ID: shifting an APIC ID right by one of
 * these yields the hardware ID of the enclosing package, node or core
 * (see topo_probe()).
 */
static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
/* Number of CPUs that assign_cpu_ids() left disabled. */
static int disabled_cpus;
167
/*
 * Description of one data/unified cache level, indexed by (level - 1).
 * id_shift is the APIC ID shift that identifies the set of logical CPUs
 * sharing the cache; present is non-zero once the level has been detected.
 *
 * The storage-class specifier is placed first: putting "static" after the
 * struct specifier is an obsolescent form.
 */
static struct cache_info {
	int id_shift;
	int present;
} caches[MAX_CACHE_LEVELS];
172
/* Read/write tunable machdep.stop_mwait; off by default. */
static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");
176
177 void
mem_range_AP_init(void)178 mem_range_AP_init(void)
179 {
180
181 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
182 mem_range_softc.mr_op->initAP(&mem_range_softc);
183 }
184
185 /*
186 * Compute ceil(log2(x)). Returns -1 if x is zero.
187 */
188 static __inline int
mask_width(u_int x)189 mask_width(u_int x)
190 {
191
192 return (x == 0 ? -1 : order_base_2(x));
193 }
194
195 /*
196 * Add a cache level to the cache topology description.
197 */
198 static int
add_deterministic_cache(int type,int level,int share_count)199 add_deterministic_cache(int type, int level, int share_count)
200 {
201
202 if (type == 0)
203 return (0);
204 if (type > 3) {
205 printf("unexpected cache type %d\n", type);
206 return (1);
207 }
208 if (type == 2) /* ignore instruction cache */
209 return (1);
210 if (level == 0 || level > MAX_CACHE_LEVELS) {
211 printf("unexpected cache level %d\n", level);
212 return (1);
213 }
214
215 if (caches[level - 1].present) {
216 printf("WARNING: multiple entries for L%u data cache\n", level);
217 printf("%u => %u\n", caches[level - 1].id_shift,
218 mask_width(share_count));
219 }
220 caches[level - 1].id_shift = mask_width(share_count);
221 caches[level - 1].present = 1;
222
223 if (caches[level - 1].id_shift > pkg_id_shift) {
224 printf("WARNING: L%u data cache covers more "
225 "APIC IDs than a package (%u > %u)\n", level,
226 caches[level - 1].id_shift, pkg_id_shift);
227 caches[level - 1].id_shift = pkg_id_shift;
228 }
229 if (caches[level - 1].id_shift < core_id_shift) {
230 printf("WARNING: L%u data cache covers fewer "
231 "APIC IDs than a core (%u < %u)\n", level,
232 caches[level - 1].id_shift, core_id_shift);
233 caches[level - 1].id_shift = core_id_shift;
234 }
235
236 return (1);
237 }
238
239 /*
240 * Determine topology of processing units and caches for AMD CPUs.
241 * See:
242 * - AMD CPUID Specification (Publication # 25481)
243 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
244 * - BKDG For AMD Family 10h Processors (Publication # 31116)
245 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
246 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
247 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
248 */
249 static void
topo_probe_amd(void)250 topo_probe_amd(void)
251 {
252 u_int p[4];
253 uint64_t v;
254 int level;
255 int nodes_per_socket;
256 int share_count;
257 int type;
258 int i;
259
260 /* No multi-core capability. */
261 if ((amd_feature2 & AMDID2_CMP) == 0)
262 return;
263
264 /*
265 * XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above
266 * xAPIC_MAX_APIC_ID. This is a workaround so we boot and function on
267 * AMD systems with high thread counts, albeit with reduced interrupt
268 * performance.
269 *
270 * We should really set the limit to xAPIC_MAX_APIC_ID by default, and
271 * have the IOMMU driver increase it. That way if a driver is present
272 * but disabled, or is otherwise not able to route the interrupts, the
273 * system can fall back to a functional state. That will require a more
274 * substantial change though, including having the IOMMU initialize
275 * earlier.
276 */
277 if (intr_apic_id_limit == -1)
278 intr_apic_id_limit = xAPIC_MAX_APIC_ID;
279
280 /* For families 10h and newer. */
281 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
282 AMDID_COREID_SIZE_SHIFT;
283
284 /* For 0Fh family. */
285 if (pkg_id_shift == 0)
286 pkg_id_shift =
287 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
288
289 /*
290 * Families prior to 16h define the following value as
291 * cores per compute unit and we don't really care about the AMD
292 * compute units at the moment. Perhaps we should treat them as
293 * cores and cores within the compute units as hardware threads,
294 * but that's up for debate.
295 * Later families define the value as threads per compute unit,
296 * so we are following AMD's nomenclature here.
297 */
298 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
299 CPUID_TO_FAMILY(cpu_id) >= 0x16) {
300 cpuid_count(0x8000001e, 0, p);
301 share_count = ((p[1] >> 8) & 0xff) + 1;
302 core_id_shift = mask_width(share_count);
303
304 /*
305 * For Zen (17h), gather Nodes per Processor. Each node is a
306 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
307 * package. Communication latency between dies is higher than
308 * within them.
309 */
310 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
311 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
312 }
313
314 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
315 for (i = 0; ; i++) {
316 cpuid_count(0x8000001d, i, p);
317 type = p[0] & 0x1f;
318 level = (p[0] >> 5) & 0x7;
319 share_count = 1 + ((p[0] >> 14) & 0xfff);
320
321 if (!add_deterministic_cache(type, level, share_count))
322 break;
323 }
324 } else {
325 if (cpu_exthigh >= 0x80000005) {
326 cpuid_count(0x80000005, 0, p);
327 if (((p[2] >> 24) & 0xff) != 0) {
328 caches[0].id_shift = 0;
329 caches[0].present = 1;
330 }
331 }
332 if (cpu_exthigh >= 0x80000006) {
333 cpuid_count(0x80000006, 0, p);
334 if (((p[2] >> 16) & 0xffff) != 0) {
335 caches[1].id_shift = 0;
336 caches[1].present = 1;
337 }
338 if (((p[3] >> 18) & 0x3fff) != 0) {
339 nodes_per_socket = 1;
340 if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
341 /*
342 * Handle multi-node processors that
343 * have multiple chips, each with its
344 * own L3 cache, on the same die.
345 */
346 v = rdmsr(0xc001100c);
347 nodes_per_socket = 1 + ((v >> 3) & 0x7);
348 }
349 caches[2].id_shift =
350 pkg_id_shift - mask_width(nodes_per_socket);
351 caches[2].present = 1;
352 }
353 }
354 }
355 }
356
357 /*
358 * Determine topology of processing units for Intel CPUs
359 * using CPUID Leaf 1 and Leaf 4, if supported.
360 * See:
361 * - Intel 64 Architecture Processor Topology Enumeration
362 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
363 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
364 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
365 */
366 static void
topo_probe_intel_0x4(void)367 topo_probe_intel_0x4(void)
368 {
369 u_int p[4];
370 int max_cores;
371 int max_logical;
372
373 /* Both zero and one here mean one logical processor per package. */
374 max_logical = (cpu_feature & CPUID_HTT) != 0 ?
375 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
376 if (max_logical <= 1)
377 return;
378
379 if (cpu_high >= 0x4) {
380 cpuid_count(0x04, 0, p);
381 max_cores = ((p[0] >> 26) & 0x3f) + 1;
382 } else
383 max_cores = 1;
384
385 core_id_shift = mask_width(max_logical/max_cores);
386 KASSERT(core_id_shift >= 0,
387 ("intel topo: max_cores > max_logical\n"));
388 pkg_id_shift = core_id_shift + mask_width(max_cores);
389 }
390
391 /*
392 * Determine topology of processing units for Intel CPUs
393 * using CPUID Leaf 1Fh or 0Bh, if supported.
394 * See:
395 * - Intel 64 Architecture Processor Topology Enumeration
396 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
397 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
398 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
399 */
400 static void
topo_probe_intel_0xb(void)401 topo_probe_intel_0xb(void)
402 {
403 u_int leaf;
404 u_int p[4] = { 0 };
405 int bits;
406 int type;
407 int i;
408
409 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */
410 if (cpu_high >= 0x1f) {
411 leaf = 0x1f;
412 cpuid_count(leaf, 0, p);
413 }
414 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */
415 if (p[1] == 0) {
416 leaf = 0x0b;
417 cpuid_count(leaf, 0, p);
418 }
419 /* Fall back to leaf 04h (Deterministic Cache Parameters). */
420 if (p[1] == 0) {
421 topo_probe_intel_0x4();
422 return;
423 }
424
425 /* We only support three levels for now. */
426 for (i = 0; ; i++) {
427 cpuid_count(leaf, i, p);
428
429 bits = p[0] & 0x1f;
430 type = (p[2] >> 8) & 0xff;
431
432 if (type == 0)
433 break;
434
435 if (type == CPUID_TYPE_SMT)
436 core_id_shift = bits;
437 else if (type == CPUID_TYPE_CORE)
438 pkg_id_shift = bits;
439 else if (bootverbose)
440 printf("Topology level type %d shift: %d\n", type, bits);
441 }
442
443 if (pkg_id_shift < core_id_shift) {
444 printf("WARNING: core covers more APIC IDs than a package\n");
445 core_id_shift = pkg_id_shift;
446 }
447 }
448
449 /*
450 * Determine topology of caches for Intel CPUs.
451 * See:
452 * - Intel 64 Architecture Processor Topology Enumeration
453 * - Intel 64 and IA-32 Architectures Software Developer’s Manual
454 * Volume 2A: Instruction Set Reference, A-M,
455 * CPUID instruction
456 */
457 static void
topo_probe_intel_caches(void)458 topo_probe_intel_caches(void)
459 {
460 u_int p[4];
461 int level;
462 int share_count;
463 int type;
464 int i;
465
466 if (cpu_high < 0x4) {
467 /*
468 * Available cache level and sizes can be determined
469 * via CPUID leaf 2, but that requires a huge table of hardcoded
470 * values, so for now just assume L1 and L2 caches potentially
471 * shared only by HTT processing units, if HTT is present.
472 */
473 caches[0].id_shift = pkg_id_shift;
474 caches[0].present = 1;
475 caches[1].id_shift = pkg_id_shift;
476 caches[1].present = 1;
477 return;
478 }
479
480 for (i = 0; ; i++) {
481 cpuid_count(0x4, i, p);
482 type = p[0] & 0x1f;
483 level = (p[0] >> 5) & 0x7;
484 share_count = 1 + ((p[0] >> 14) & 0xfff);
485
486 if (!add_deterministic_cache(type, level, share_count))
487 break;
488 }
489 }
490
491 /*
492 * Determine topology of processing units and caches for Intel CPUs.
493 * See:
494 * - Intel 64 Architecture Processor Topology Enumeration
495 */
496 static void
topo_probe_intel(void)497 topo_probe_intel(void)
498 {
499
500 /*
501 * Note that 0x1 <= cpu_high < 4 case should be
502 * compatible with topo_probe_intel_0x4() logic when
503 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
504 * or it should trigger the fallback otherwise.
505 */
506 if (cpu_high >= 0xb)
507 topo_probe_intel_0xb();
508 else if (cpu_high >= 0x1)
509 topo_probe_intel_0x4();
510
511 topo_probe_intel_caches();
512 }
513
/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 *
 * The function runs its body once; later calls return immediately.
 * It builds an ordered list of topology layers (outermost first), then
 * populates the tree rooted at topo_root with one chain of nodes per
 * present APIC ID.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	/*
	 * One slot each for the package, group, core, PU and (optional)
	 * NUMA node layers, plus one per cache level.
	 */
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 5];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
	int d, domain;
#endif

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	/* Vendor-specific discovery of the ID shifts and cache table. */
	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	/* The package layer is always present and always outermost. */
	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	/* Add a group layer only when a node is strictly smaller than a package. */
	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 * Caches are added from the highest level down.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	/* A core layer is only meaningful when a package holds several cores. */
	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/* Innermost layer: the processing unit itself (full APIC ID). */
	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

#if defined(DEV_ACPI) && MAXMEMDOM > 1
	if (vm_ndomains > 1) {
		/*
		 * Find the outermost layer whose groups of APIC IDs never
		 * span two different domains as reported by
		 * acpi_pxm_get_cpu_locality(), and insert a NUMA node layer
		 * directly above it.
		 */
		for (layer = 0; layer < nlayers; ++layer) {
			for (i = 0; i <= max_apic_id; ++i) {
				/* Start of a new group at this layer: forget the domain. */
				if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
					domain = -1;
				if (!cpu_info[i].cpu_present)
					continue;
				d = acpi_pxm_get_cpu_locality(i);
				/* Two domains inside one group: try the next layer. */
				if (domain >= 0 && domain != d)
					break;
				domain = d;
			}
			/* Inner loop ran to completion: this layer fits. */
			if (i > max_apic_id)
				break;
		}
		KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
		/* Shift the found layer and everything below it down by one. */
		memmove(&topo_layers[layer+1], &topo_layers[layer],
		    sizeof(*topo_layers) * (nlayers - layer));
		topo_layers[layer].type = TOPO_TYPE_NODE;
		topo_layers[layer].subtype = CG_SHARE_NONE;
		nlayers++;
	}
#endif

	/*
	 * Build the tree: for every present APIC ID, descend from the root
	 * through each layer, adding the node for this CPU's component at
	 * that layer via topo_add_node_by_hwid().
	 */
	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
			/* The NUMA layer is keyed by domain, not by APIC ID bits. */
			if (topo_layers[layer].type == TOPO_TYPE_NODE) {
				node_id = acpi_pxm_get_cpu_locality(i);
			} else
#endif
				node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	/*
	 * Walk the BSP's chain from the root and call topo_promote_child()
	 * on each of its nodes.
	 * NOTE(review): presumably this moves the BSP's nodes to the front
	 * of their sibling lists so that the BSP is enumerated first --
	 * confirm against topo_promote_child().
	 */
	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
		if (topo_layers[layer].type == TOPO_TYPE_NODE)
			node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
		else
#endif
			node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}
675
676 /*
677 * Assign logical CPU IDs to local APICs.
678 */
679 void
assign_cpu_ids(void)680 assign_cpu_ids(void)
681 {
682 struct topo_node *node;
683 u_int smt_mask;
684 int nhyper;
685
686 smt_mask = (1u << core_id_shift) - 1;
687
688 /*
689 * Assign CPU IDs to local APIC IDs and disable any CPUs
690 * beyond MAXCPU. CPU 0 is always assigned to the BSP.
691 */
692 mp_ncpus = 0;
693 nhyper = 0;
694 TOPO_FOREACH(node, &topo_root) {
695 if (node->type != TOPO_TYPE_PU)
696 continue;
697
698 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
699 cpu_info[node->hwid].cpu_hyperthread = 1;
700
701 if (resource_disabled("lapic", node->hwid)) {
702 if (node->hwid != boot_cpu_id)
703 cpu_info[node->hwid].cpu_disabled = 1;
704 else
705 printf("Cannot disable BSP, APIC ID = %d\n",
706 node->hwid);
707 }
708
709 if (!hyperthreading_allowed &&
710 cpu_info[node->hwid].cpu_hyperthread)
711 cpu_info[node->hwid].cpu_disabled = 1;
712
713 if (mp_ncpus >= MAXCPU)
714 cpu_info[node->hwid].cpu_disabled = 1;
715
716 if (cpu_info[node->hwid].cpu_disabled) {
717 disabled_cpus++;
718 continue;
719 }
720
721 if (cpu_info[node->hwid].cpu_hyperthread)
722 nhyper++;
723
724 cpu_apic_ids[mp_ncpus] = node->hwid;
725 apic_cpuids[node->hwid] = mp_ncpus;
726 topo_set_pu_id(node, mp_ncpus);
727 mp_ncpus++;
728 }
729
730 KASSERT(mp_maxid >= mp_ncpus - 1,
731 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
732 mp_ncpus));
733
734 mp_ncores = mp_ncpus - nhyper;
735 smp_threads_per_core = mp_ncpus / mp_ncores;
736 }
737
738 /*
739 * Print various information about the SMP system hardware and setup.
740 */
741 void
cpu_mp_announce(void)742 cpu_mp_announce(void)
743 {
744 struct topo_node *node;
745 const char *hyperthread;
746 struct topo_analysis topology;
747
748 printf("FreeBSD/SMP: ");
749 if (topo_analyze(&topo_root, 1, &topology)) {
750 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
751 if (topology.entities[TOPO_LEVEL_GROUP] > 1)
752 printf(" x %d groups",
753 topology.entities[TOPO_LEVEL_GROUP]);
754 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
755 printf(" x %d cache groups",
756 topology.entities[TOPO_LEVEL_CACHEGROUP]);
757 if (topology.entities[TOPO_LEVEL_CORE] > 0)
758 printf(" x %d core(s)",
759 topology.entities[TOPO_LEVEL_CORE]);
760 if (topology.entities[TOPO_LEVEL_THREAD] > 1)
761 printf(" x %d hardware threads",
762 topology.entities[TOPO_LEVEL_THREAD]);
763 } else {
764 printf("Non-uniform topology");
765 }
766 printf("\n");
767
768 if (disabled_cpus) {
769 printf("FreeBSD/SMP Online: ");
770 if (topo_analyze(&topo_root, 0, &topology)) {
771 printf("%d package(s)",
772 topology.entities[TOPO_LEVEL_PKG]);
773 if (topology.entities[TOPO_LEVEL_GROUP] > 1)
774 printf(" x %d groups",
775 topology.entities[TOPO_LEVEL_GROUP]);
776 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
777 printf(" x %d cache groups",
778 topology.entities[TOPO_LEVEL_CACHEGROUP]);
779 if (topology.entities[TOPO_LEVEL_CORE] > 0)
780 printf(" x %d core(s)",
781 topology.entities[TOPO_LEVEL_CORE]);
782 if (topology.entities[TOPO_LEVEL_THREAD] > 1)
783 printf(" x %d hardware threads",
784 topology.entities[TOPO_LEVEL_THREAD]);
785 } else {
786 printf("Non-uniform topology");
787 }
788 printf("\n");
789 }
790
791 if (!bootverbose)
792 return;
793
794 TOPO_FOREACH(node, &topo_root) {
795 switch (node->type) {
796 case TOPO_TYPE_PKG:
797 printf("Package HW ID = %u\n", node->hwid);
798 break;
799 case TOPO_TYPE_CORE:
800 printf("\tCore HW ID = %u\n", node->hwid);
801 break;
802 case TOPO_TYPE_PU:
803 if (cpu_info[node->hwid].cpu_hyperthread)
804 hyperthread = "/HT";
805 else
806 hyperthread = "";
807
808 if (node->subtype == 0)
809 printf("\t\tCPU (AP%s): APIC ID: %u"
810 "(disabled)\n", hyperthread, node->hwid);
811 else if (node->id == 0)
812 printf("\t\tCPU0 (BSP): APIC ID: %u\n",
813 node->hwid);
814 else
815 printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
816 node->id, hyperthread, node->hwid);
817 break;
818 default:
819 /* ignored */
820 break;
821 }
822 }
823 }
824
/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 *
 * root is the hardware topology node to translate (system, cache, NUMA
 * node or group); cg_root is the caller-provided cpu_group to fill in.
 * Child groups are allocated with smp_topo_alloc().
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	/* Copy the basic properties of the hardware node into the group. */
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	/* Only cache nodes carry a sharing level (their cache level). */
	if (root->type == TOPO_TYPE_CACHE)
		cg_root->cg_level = root->subtype;
	else
		cg_root->cg_level = CG_SHARE_NONE;
	if (root->type == TOPO_TYPE_NODE)
		cg_root->cg_flags = CG_FLAG_NODE;
	else
		cg_root->cg_flags = 0;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		/* Count the core and skip everything beneath it. */
		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags |= CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		/*
		 * When some APICs are disabled by tunables, nodes can end up
		 * with an empty cpuset. Nodes with an empty cpuset will be
		 * translated into cpu groups with empty cpusets. smp_topo_fill
		 * will then set cg_first and cg_last to -1. This isn't
		 * correctly handled in all functions. E.g. when
		 * cpu_search_lowest and cpu_search_highest loop through all
		 * cpus, they call CPU_ISSET on cpu -1 which ends up in a
		 * general protection fault.
		 *
		 * We could fix the scheduler to handle empty cpu groups
		 * correctly. Nevertheless, empty cpu groups are causing
		 * overhead for no value. So, it makes more sense to just don't
		 * create them.
		 */
		if (CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		/*
		 * A node covering exactly the root's CPUs is redundant as a
		 * child; fold its cache level / NUMA flag into the root
		 * group instead.  (This also matches root itself on the
		 * first iteration.)
		 */
		if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
			if (node->type == TOPO_TYPE_CACHE &&
			    cg_root->cg_level < node->subtype)
				cg_root->cg_level = node->subtype;
			if (node->type == TOPO_TYPE_NODE)
				cg_root->cg_flags |= CG_FLAG_NODE;
			node = topo_next_node(root, node);
			continue;
		}
		/* Only group, NUMA node and cache nodes become child groups. */
		if (node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	/*
	 * We are not interested in nodes including only one CPU each.
	 */
	if (nchildren == root->cpu_count)
		return;

	/*
	 * We are not interested in nodes without children.
	 */
	cg_root->cg_children = nchildren;
	if (nchildren == 0)
		return;

	cg_root->cg_child = smp_topo_alloc(nchildren);

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 * The skip condition below must select exactly the nodes counted
	 * in the previous loop, since cg_child has nchildren entries.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) ||
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0 ||
		    CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}
958
959 /*
960 * Build the MI scheduling topology from the discovered hardware topology.
961 */
962 struct cpu_group *
cpu_topo(void)963 cpu_topo(void)
964 {
965 struct cpu_group *cg_root;
966
967 if (mp_ncpus <= 1)
968 return (smp_topo_none());
969
970 cg_root = smp_topo_alloc(1);
971 x86topo_add_sched_group(&topo_root, cg_root);
972 return (cg_root);
973 }
974
975 static void
cpu_alloc(void * dummy __unused)976 cpu_alloc(void *dummy __unused)
977 {
978 /*
979 * Dynamically allocate the arrays that depend on the
980 * maximum APIC ID.
981 */
982 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
983 M_WAITOK | M_ZERO);
984 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
985 M_WAITOK | M_ZERO);
986 }
987 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
988
989 /*
990 * Add a logical CPU to the topology.
991 */
992 void
cpu_add(u_int apic_id,char boot_cpu)993 cpu_add(u_int apic_id, char boot_cpu)
994 {
995
996 if (apic_id > max_apic_id)
997 panic("SMP: APIC ID %d too high", apic_id);
998
999 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
1000 apic_id));
1001 cpu_info[apic_id].cpu_present = 1;
1002 if (boot_cpu) {
1003 KASSERT(boot_cpu_id == -1,
1004 ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
1005 boot_cpu_id));
1006 boot_cpu_id = apic_id;
1007 cpu_info[apic_id].cpu_bsp = 1;
1008 }
1009 if (bootverbose)
1010 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
1011 "AP");
1012 }
1013
1014 void
cpu_mp_setmaxid(void)1015 cpu_mp_setmaxid(void)
1016 {
1017
1018 /*
1019 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
1020 * If there were no calls to cpu_add() assume this is a UP system.
1021 */
1022 if (mp_ncpus == 0)
1023 mp_ncpus = 1;
1024 }
1025
1026 int
cpu_mp_probe(void)1027 cpu_mp_probe(void)
1028 {
1029
1030 /*
1031 * Always record BSP in CPU map so that the mbuf init code works
1032 * correctly.
1033 */
1034 CPU_SETOF(0, &all_cpus);
1035 return (mp_ncpus > 1);
1036 }
1037
/*
 * AP CPU's call this to initialize themselves.
 *
 * Performs the per-CPU hardware setup, announces the AP under
 * ap_boot_lock, waits for smp_started and then enters the scheduler via
 * sched_ap_entry().  It is not expected to return: the code panics if
 * the scheduler hands control back.
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	/* Optional cpu_ops hook for per-CPU initialization. */
	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));
	schedinit_ap();

	mca_init();

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	/*
	 * Use naive spinning lock instead of the real spinlock, since
	 * printfs() below might take a very long time and trigger
	 * spinlock timeout panics.  This is the only use of the
	 * ap_boot_lock anyway.
	 */
	while (atomic_cmpset_acq_int(&ap_boot_lock, 0, 1) == 0)
		ia32_pause();

	/* Everything up to the release of ap_boot_lock runs one AP at a time. */
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		/* Compact form: header before the first AP, newline after the last. */
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/* Determine if we are a logical CPU. */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	/* The last AP to arrive flips smp_started for everyone. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
	}

	/* Release the naive spin lock taken above. */
	atomic_store_rel_int(&ap_boot_lock, 0);

#ifdef __amd64__
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	/* Wait until all the AP's are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

	kcsan_cpu_init(cpuid);

	sched_ap_entry();

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
1148
1149 static void
smp_after_idle_runnable(void * arg __unused)1150 smp_after_idle_runnable(void *arg __unused)
1151 {
1152 int cpu;
1153
1154 if (mp_ncpus == 1)
1155 return;
1156
1157 KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
1158
1159 /*
1160 * Wait for all APs to handle an interrupt. After that, we know that
1161 * the APs have entered the scheduler at least once, so the boot stacks
1162 * are safe to free.
1163 */
1164 smp_rendezvous(smp_no_rendezvous_barrier, NULL,
1165 smp_no_rendezvous_barrier, NULL);
1166
1167 for (cpu = 1; cpu < mp_ncpus; cpu++) {
1168 kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE);
1169 }
1170 }
1171 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
1172 smp_after_idle_runnable, NULL);
1173
1174 /*
1175 * We tell the I/O APIC code about all the CPUs we want to receive
1176 * interrupts. If we don't want certain CPUs to receive IRQs we
1177 * can simply not tell the I/O APIC code about them in this function.
1178 * We also do not tell it about the BSP since it tells itself about
1179 * the BSP internally to work with UP kernels and on UP machines.
1180 */
1181 void
set_interrupt_apic_ids(void)1182 set_interrupt_apic_ids(void)
1183 {
1184 u_int i, apic_id;
1185
1186 for (i = 0; i < MAXCPU; i++) {
1187 apic_id = cpu_apic_ids[i];
1188 if (apic_id == -1)
1189 continue;
1190 if (cpu_info[apic_id].cpu_bsp)
1191 continue;
1192 if (cpu_info[apic_id].cpu_disabled)
1193 continue;
1194 if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit)
1195 continue;
1196
1197 /* Don't let hyperthreads service interrupts. */
1198 if (cpu_info[apic_id].cpu_hyperthread &&
1199 !hyperthreading_intr_allowed)
1200 continue;
1201
1202 intr_add_cpu(i);
1203 }
1204 }
1205
#ifdef COUNT_XINVLTLB_HITS
/*
 * Debug counters, compiled in only when COUNT_XINVLTLB_HITS is defined.
 *
 * The three arrays below have one slot per possible CPU (MAXCPU entries)
 * and are exported read-write as opaque blobs under the debug.xhits sysctl
 * node.
 * NOTE(review): judging by the names these count global/page/range TLB
 * invalidation hits; the code that updates them is outside this section.
 */
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

/*
 * System-wide scalar counters exported under the same debug.xhits node.
 * NOTE(review): presumably the number of IPIs sent per invalidation type;
 * confirm against the code that increments them.
 */
u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */
1229
/*
 * Init and startup IPI.
 *
 * Sends the INIT / INIT-deassert / STARTUP / STARTUP sequence to one CPU.
 *
 *   apic_id - physical destination APIC ID of the CPU to start.
 *   vector  - value OR'ed into the STARTUP IPI command.
 *             NOTE(review): presumably encodes the real-mode start page
 *             of the AP trampoline; confirm against the caller.
 *
 * Panics if either STARTUP IPI is not reported as delivered.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 *
	 * NOTE(review): the waits below pass 100 to lapic_ipi_wait() and
	 * the result of the wait after the INIT assert is not checked, so
	 * only the two STARTUP IPIs can actually panic here.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10mS */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200uS */
}
1292
1293 static bool
ipi_bitmap_set(int cpu,u_int ipi)1294 ipi_bitmap_set(int cpu, u_int ipi)
1295 {
1296 u_int bitmap, old, new;
1297 u_int *cpu_bitmap;
1298
1299 bitmap = 1 << ipi;
1300 cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
1301 old = *cpu_bitmap;
1302 for (;;) {
1303 if ((old & bitmap) != 0)
1304 break;
1305 new = old | bitmap;
1306 if (atomic_fcmpset_int(cpu_bitmap, &old, new))
1307 break;
1308 }
1309 return (old != 0);
1310 }
1311
1312 /*
1313 * Send an IPI to specified CPU handling the bitmap logic.
1314 */
1315 static void
ipi_send_cpu(int cpu,u_int ipi)1316 ipi_send_cpu(int cpu, u_int ipi)
1317 {
1318
1319 KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
1320 ("IPI to non-existent CPU %d", cpu));
1321
1322 if (IPI_IS_BITMAPED(ipi)) {
1323 if (ipi_bitmap_set(cpu, ipi))
1324 return;
1325 ipi = IPI_BITMAP_VECTOR;
1326 }
1327 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1328 }
1329
1330 void
ipi_bitmap_handler(struct trapframe frame)1331 ipi_bitmap_handler(struct trapframe frame)
1332 {
1333 struct trapframe *oldframe;
1334 struct thread *td;
1335 int cpu = PCPU_GET(cpuid);
1336 u_int ipi_bitmap;
1337
1338 kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);
1339
1340 td = curthread;
1341 ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
1342 pc_ipi_bitmap);
1343
1344 /*
1345 * sched_preempt() must be called to clear the pending preempt
1346 * IPI to enable delivery of further preempts. However, the
1347 * critical section will cause extra scheduler lock thrashing
1348 * when used unconditionally. Only critical_enter() if
1349 * hardclock must also run, which requires the section entry.
1350 */
1351 if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1352 critical_enter();
1353
1354 td->td_intr_nesting_level++;
1355 oldframe = td->td_intr_frame;
1356 td->td_intr_frame = &frame;
1357 #if defined(STACK) || defined(DDB)
1358 if (ipi_bitmap & (1 << IPI_TRACE))
1359 stack_capture_intr();
1360 #endif
1361 if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1362 #ifdef COUNT_IPIS
1363 (*ipi_preempt_counts[cpu])++;
1364 #endif
1365 sched_preempt(td);
1366 }
1367 if (ipi_bitmap & (1 << IPI_AST)) {
1368 #ifdef COUNT_IPIS
1369 (*ipi_ast_counts[cpu])++;
1370 #endif
1371 /* Nothing to do for AST */
1372 }
1373 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1374 #ifdef COUNT_IPIS
1375 (*ipi_hardclock_counts[cpu])++;
1376 #endif
1377 hardclockintr();
1378 }
1379 td->td_intr_frame = oldframe;
1380 td->td_intr_nesting_level--;
1381 if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1382 critical_exit();
1383 }
1384
1385 /*
1386 * send an IPI to a set of cpus.
1387 */
1388 void
ipi_selected(cpuset_t cpus,u_int ipi)1389 ipi_selected(cpuset_t cpus, u_int ipi)
1390 {
1391 int cpu;
1392
1393 /*
1394 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1395 * of help in order to understand what is the source.
1396 * Set the mask of receiving CPUs for this purpose.
1397 */
1398 if (ipi == IPI_STOP_HARD)
1399 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1400
1401 CPU_FOREACH_ISSET(cpu, &cpus) {
1402 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1403 ipi_send_cpu(cpu, ipi);
1404 }
1405 }
1406
1407 /*
1408 * send an IPI to a specific CPU.
1409 */
1410 void
ipi_cpu(int cpu,u_int ipi)1411 ipi_cpu(int cpu, u_int ipi)
1412 {
1413
1414 /*
1415 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1416 * of help in order to understand what is the source.
1417 * Set the mask of receiving CPUs for this purpose.
1418 */
1419 if (ipi == IPI_STOP_HARD)
1420 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1421
1422 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1423 ipi_send_cpu(cpu, ipi);
1424 }
1425
1426 /*
1427 * send an IPI to all CPUs EXCEPT myself
1428 */
1429 void
ipi_all_but_self(u_int ipi)1430 ipi_all_but_self(u_int ipi)
1431 {
1432 cpuset_t other_cpus;
1433 int cpu, c;
1434
1435 if (mp_ncpus == 1)
1436 return;
1437
1438 /*
1439 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1440 * of help in order to understand what is the source.
1441 * Set the mask of receiving CPUs for this purpose.
1442 */
1443 if (ipi == IPI_STOP_HARD) {
1444 other_cpus = all_cpus;
1445 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1446 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1447 }
1448
1449 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1450 if (IPI_IS_BITMAPED(ipi)) {
1451 cpu = PCPU_GET(cpuid);
1452 CPU_FOREACH(c) {
1453 if (c != cpu)
1454 ipi_bitmap_set(c, ipi);
1455 }
1456 ipi = IPI_BITMAP_VECTOR;
1457 }
1458 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1459 }
1460
1461 void
ipi_self_from_nmi(u_int vector)1462 ipi_self_from_nmi(u_int vector)
1463 {
1464
1465 lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);
1466
1467 /* Wait for IPI to finish. */
1468 if (!lapic_ipi_wait(50000)) {
1469 if (KERNEL_PANICKED())
1470 return;
1471 else
1472 panic("APIC: IPI is stuck");
1473 }
1474 }
1475
1476 int
ipi_nmi_handler(void)1477 ipi_nmi_handler(void)
1478 {
1479 u_int cpuid;
1480
1481 /*
1482 * As long as there is not a simple way to know about a NMI's
1483 * source, if the bitmask for the current CPU is present in
1484 * the global pending bitword an IPI_STOP_HARD has been issued
1485 * and should be handled.
1486 */
1487 cpuid = PCPU_GET(cpuid);
1488 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1489 return (1);
1490
1491 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1492 cpustop_handler();
1493 return (0);
1494 }
1495
1496 int nmi_kdb_lock;
1497
1498 void
nmi_call_kdb_smp(u_int type,struct trapframe * frame)1499 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
1500 {
1501 int cpu;
1502 bool call_post;
1503
1504 cpu = PCPU_GET(cpuid);
1505 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
1506 nmi_call_kdb(cpu, type, frame);
1507 call_post = false;
1508 } else {
1509 savectx(&stoppcbs[cpu]);
1510 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1511 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
1512 ia32_pause();
1513 call_post = true;
1514 }
1515 atomic_store_rel_int(&nmi_kdb_lock, 0);
1516 if (call_post)
1517 cpustop_handler_post(cpu);
1518 }
1519
1520 /*
1521 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
1522 * if available) until we are resumed.
1523 */
1524 void
cpustop_handler(void)1525 cpustop_handler(void)
1526 {
1527 struct monitorbuf *mb;
1528 u_int cpu;
1529 bool use_mwait;
1530
1531 cpu = PCPU_GET(cpuid);
1532
1533 savectx(&stoppcbs[cpu]);
1534
1535 use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
1536 !mwait_cpustop_broken);
1537 if (use_mwait) {
1538 mb = PCPU_PTR(monitorbuf);
1539 atomic_store_int(&mb->stop_state,
1540 MONITOR_STOPSTATE_STOPPED);
1541 }
1542
1543 /* Indicate that we are stopped */
1544 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1545
1546 /* Wait for restart */
1547 while (!CPU_ISSET(cpu, &started_cpus)) {
1548 if (use_mwait) {
1549 cpu_monitor(mb, 0, 0);
1550 if (atomic_load_int(&mb->stop_state) ==
1551 MONITOR_STOPSTATE_STOPPED)
1552 cpu_mwait(0, MWAIT_C1);
1553 continue;
1554 }
1555
1556 ia32_pause();
1557
1558 /*
1559 * Halt non-BSP CPUs on panic -- we're never going to need them
1560 * again, and might as well save power / release resources
1561 * (e.g., overprovisioned VM infrastructure).
1562 */
1563 while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
1564 halt();
1565 }
1566
1567 cpustop_handler_post(cpu);
1568 }
1569
1570 static void
cpustop_handler_post(u_int cpu)1571 cpustop_handler_post(u_int cpu)
1572 {
1573
1574 CPU_CLR_ATOMIC(cpu, &started_cpus);
1575 CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1576
1577 /*
1578 * We don't broadcast TLB invalidations to other CPUs when they are
1579 * stopped. Hence, we clear the TLB before resuming.
1580 */
1581 invltlb_glob();
1582
1583 #if defined(__amd64__) && (defined(DDB) || defined(GDB))
1584 amd64_db_resume_dbreg();
1585 #endif
1586
1587 if (cpu == 0 && cpustop_restartfunc != NULL) {
1588 cpustop_restartfunc();
1589 cpustop_restartfunc = NULL;
1590 }
1591 }
1592
/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 *
 * The function saves the CPU and FPU state into susppcbs[cpu], marks the CPU
 * in suspended_cpus and resuming_cpus, and then waits for its bit in
 * toresume_cpus.  Once released it reloads microcode, runs the resume hooks,
 * re-initializes MCA and the local APIC, and clears its bits from
 * resuming_cpus, suspended_cpus and toresume_cpus.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

#ifdef __amd64__
	/* Optional suspend hook; skipped when the pointer is not set. */
	if (vmm_suspend_p)
		vmm_suspend_p();
#endif

	cpu = PCPU_GET(cpuid);

#ifdef XENHVM
	/*
	 * Some Xen guest types (PVH) expose a very minimal set of ACPI tables,
	 * and for example have no support for SCI.  That leads to the suspend
	 * stacks not being allocated, and hence when attempting to perform a
	 * Xen triggered suspension FreeBSD will hit a #PF.  Avoid saving the
	 * CPU and FPU contexts if the stacks are not allocated, as the
	 * hypervisor will already take care of this.  Note that we could even
	 * do this for Xen triggered suspensions on guests that have full ACPI
	 * support, but doing so would introduce extra complexity.
	 */
	if (susppcbs == NULL) {
		KASSERT(vm_guest == VM_GUEST_XEN, ("Missing suspend stack"));
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);
	} else
#endif
	/*
	 * A non-zero return from savectx() selects the suspend path below.
	 * NOTE(review): the zero-return path presumably corresponds to
	 * execution re-entering here through the saved context after
	 * wakeup; confirm against savectx()/resumectx().
	 */
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

#ifdef __i386__
	/* Finish removing the identity mapping of low memory for this AP. */
	invltlb_glob();
#endif

	/* Optional resume hooks; each is skipped when not set. */
	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}
1705
1706 void
cpuoff_handler(void)1707 cpuoff_handler(void)
1708 {
1709 u_int cpu;
1710
1711 cpu = PCPU_GET(cpuid);
1712
1713 /* Time to go catatonic. A reset will be required to leave. */
1714 disable_intr();
1715 lapic_disable();
1716 CPU_SET_ATOMIC(cpu, &suspended_cpus);
1717
1718 /*
1719 * There technically should be no need for the `while` here, since it
1720 * cannot be interrupted (interrupts are disabled). Be safe anyway.
1721 * Any interrupt at this point will likely be fatal, as the page tables
1722 * are likely going away shortly.
1723 */
1724 while (1)
1725 halt();
1726 }
1727
1728 /*
1729 * Handle an IPI_SWI by waking delayed SWI thread.
1730 */
1731 void
ipi_swi_handler(struct trapframe frame)1732 ipi_swi_handler(struct trapframe frame)
1733 {
1734
1735 intr_event_handle(clk_intr_event, &frame);
1736 }
1737
1738 /*
1739 * This is called once the rest of the system is up and running and we're
1740 * ready to let the AP's out of the pen.
1741 */
1742 static void
release_aps(void * dummy __unused)1743 release_aps(void *dummy __unused)
1744 {
1745
1746 if (mp_ncpus == 1)
1747 return;
1748 atomic_store_rel_int(&aps_ready, 1);
1749 while (smp_started == 0)
1750 ia32_pause();
1751 }
1752 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1753
#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 *
 * For every CPU visited by CPU_FOREACH() this registers one named counter
 * per IPI type through intrcnt_add(), using names of the form
 * "cpu<N>:<type>".  The SYSINIT argument is not used.
 */
static void
mp_ipi_intrcnt(void *dummy __unused)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif
1785