1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 * derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 #include "opt_acpi.h"
29 #ifdef __i386__
30 #include "opt_apic.h"
31 #endif
32 #include "opt_cpu.h"
33 #include "opt_ddb.h"
34 #include "opt_gdb.h"
35 #include "opt_kstack_pages.h"
36 #include "opt_pmap.h"
37 #include "opt_sched.h"
38 #include "opt_smp.h"
39 #include "opt_stack.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/asan.h>
44 #include <sys/bus.h>
45 #include <sys/cons.h> /* cngetc() */
46 #include <sys/cpuset.h>
47 #include <sys/csan.h>
48 #include <sys/interrupt.h>
49 #include <sys/kdb.h>
50 #include <sys/kernel.h>
51 #include <sys/ktr.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/memrange.h>
55 #include <sys/mutex.h>
56 #include <sys/pcpu.h>
57 #include <sys/proc.h>
58 #include <sys/sched.h>
59 #include <sys/smp.h>
60 #include <sys/sysctl.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/pmap.h>
65 #include <vm/vm_kern.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68
69 #include <x86/apicreg.h>
70 #include <machine/clock.h>
71 #include <machine/cpu.h>
72 #include <machine/cputypes.h>
73 #include <x86/mca.h>
74 #include <machine/md_var.h>
75 #include <machine/pcb.h>
76 #include <machine/psl.h>
77 #include <machine/smp.h>
78 #include <machine/specialreg.h>
79 #include <machine/stack.h>
80 #include <x86/ucode.h>
81
82 #ifdef DEV_ACPI
83 #include <contrib/dev/acpica/include/acpi.h>
84 #include <dev/acpica/acpivar.h>
85 #endif
86
/* malloc(9) type used by cpu_alloc() for the per-APIC-ID arrays. */
static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");

int mp_naps;			/* # of application processors */
int boot_cpu_id = -1;		/* designated BSP; -1 until cpu_add() sets it */

/* AP uses this during bootstrap.  Do not staticize. */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/* used to hold the AP's until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 *
 * cpu_info[] and apic_cpuids[] are indexed by APIC ID and are allocated by
 * cpu_alloc() with max_apic_id + 1 entries.  apic_cpuids[] maps an APIC ID
 * to its logical CPU ID and cpu_apic_ids[] is the reverse mapping; both are
 * filled in by assign_cpu_ids().
 */
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger that MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID");

static void release_aps(void *dummy);
static void cpustop_handler_post(u_int cpu);

static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
    &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static int hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
    &hyperthreading_intr_allowed, 0,
    "Allow interrupts on HTT logical CPUs");

static int intr_apic_id_limit = -1;
SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN,
    &intr_apic_id_limit, 0,
    "Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)");

/* Root of the hardware topology tree built by topo_probe(). */
static struct topo_node topo_root;

/*
 * Right-shift counts that turn an APIC ID into the ID of the enclosing
 * package, node or core (topo_probe() computes "apic_id >> shift").
 * They stay zero unless a vendor-specific probe sets them.
 */
static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
/* Number of processing units marked disabled by assign_cpu_ids(). */
static int disabled_cpus;

/*
 * Per-level cache description; caches[n] describes the level n + 1 cache.
 * id_shift has the same meaning as the *_id_shift variables above and
 * present is non-zero once the level has been detected.
 */
struct cache_info {
	int id_shift;
	int present;
} static caches[MAX_CACHE_LEVELS];

static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");
176
177 void
mem_range_AP_init(void)178 mem_range_AP_init(void)
179 {
180
181 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
182 mem_range_softc.mr_op->initAP(&mem_range_softc);
183 }
184
185 /*
186 * Compute ceil(log2(x)). Returns -1 if x is zero.
187 */
188 static __inline int
mask_width(u_int x)189 mask_width(u_int x)
190 {
191
192 return (x == 0 ? -1 : order_base_2(x));
193 }
194
195 /*
196 * Add a cache level to the cache topology description.
197 */
198 static int
add_deterministic_cache(int type,int level,int share_count)199 add_deterministic_cache(int type, int level, int share_count)
200 {
201
202 if (type == 0)
203 return (0);
204 if (type > 3) {
205 printf("unexpected cache type %d\n", type);
206 return (1);
207 }
208 if (type == 2) /* ignore instruction cache */
209 return (1);
210 if (level == 0 || level > MAX_CACHE_LEVELS) {
211 printf("unexpected cache level %d\n", level);
212 return (1);
213 }
214
215 if (caches[level - 1].present) {
216 printf("WARNING: multiple entries for L%u data cache\n", level);
217 printf("%u => %u\n", caches[level - 1].id_shift,
218 mask_width(share_count));
219 }
220 caches[level - 1].id_shift = mask_width(share_count);
221 caches[level - 1].present = 1;
222
223 if (caches[level - 1].id_shift > pkg_id_shift) {
224 printf("WARNING: L%u data cache covers more "
225 "APIC IDs than a package (%u > %u)\n", level,
226 caches[level - 1].id_shift, pkg_id_shift);
227 caches[level - 1].id_shift = pkg_id_shift;
228 }
229 if (caches[level - 1].id_shift < core_id_shift) {
230 printf("WARNING: L%u data cache covers fewer "
231 "APIC IDs than a core (%u < %u)\n", level,
232 caches[level - 1].id_shift, core_id_shift);
233 caches[level - 1].id_shift = core_id_shift;
234 }
235
236 return (1);
237 }
238
239 /*
240 * Determine topology of processing units and caches for AMD CPUs.
241 * See:
242 * - AMD CPUID Specification (Publication # 25481)
243 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
244 * - BKDG For AMD Family 10h Processors (Publication # 31116)
245 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
246 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
247 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
248 */
249 static void
topo_probe_amd(void)250 topo_probe_amd(void)
251 {
252 u_int p[4];
253 uint64_t v;
254 int level;
255 int nodes_per_socket;
256 int share_count;
257 int type;
258 int i;
259
260 /* No multi-core capability. */
261 if ((amd_feature2 & AMDID2_CMP) == 0)
262 return;
263
264 /*
265 * XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above
266 * xAPIC_MAX_APIC_ID. This is a workaround so we boot and function on
267 * AMD systems with high thread counts, albeit with reduced interrupt
268 * performance.
269 *
270 * We should really set the limit to xAPIC_MAX_APIC_ID by default, and
271 * have the IOMMU driver increase it. That way if a driver is present
272 * but disabled, or is otherwise not able to route the interrupts, the
273 * system can fall back to a functional state. That will require a more
274 * substantial change though, including having the IOMMU initialize
275 * earlier.
276 */
277 if (intr_apic_id_limit == -1)
278 intr_apic_id_limit = xAPIC_MAX_APIC_ID;
279
280 /* For families 10h and newer. */
281 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
282 AMDID_COREID_SIZE_SHIFT;
283
284 /* For 0Fh family. */
285 if (pkg_id_shift == 0)
286 pkg_id_shift =
287 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
288
289 /*
290 * Families prior to 16h define the following value as
291 * cores per compute unit and we don't really care about the AMD
292 * compute units at the moment. Perhaps we should treat them as
293 * cores and cores within the compute units as hardware threads,
294 * but that's up for debate.
295 * Later families define the value as threads per compute unit,
296 * so we are following AMD's nomenclature here.
297 */
298 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
299 CPUID_TO_FAMILY(cpu_id) >= 0x16) {
300 cpuid_count(0x8000001e, 0, p);
301 share_count = ((p[1] >> 8) & 0xff) + 1;
302 core_id_shift = mask_width(share_count);
303
304 /*
305 * For Zen (17h), gather Nodes per Processor. Each node is a
306 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
307 * package. Communication latency between dies is higher than
308 * within them.
309 */
310 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
311 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
312 }
313
314 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
315 for (i = 0; ; i++) {
316 cpuid_count(0x8000001d, i, p);
317 type = p[0] & 0x1f;
318 level = (p[0] >> 5) & 0x7;
319 share_count = 1 + ((p[0] >> 14) & 0xfff);
320
321 if (!add_deterministic_cache(type, level, share_count))
322 break;
323 }
324 } else {
325 if (cpu_exthigh >= 0x80000005) {
326 cpuid_count(0x80000005, 0, p);
327 if (((p[2] >> 24) & 0xff) != 0) {
328 caches[0].id_shift = 0;
329 caches[0].present = 1;
330 }
331 }
332 if (cpu_exthigh >= 0x80000006) {
333 cpuid_count(0x80000006, 0, p);
334 if (((p[2] >> 16) & 0xffff) != 0) {
335 caches[1].id_shift = 0;
336 caches[1].present = 1;
337 }
338 if (((p[3] >> 18) & 0x3fff) != 0) {
339 nodes_per_socket = 1;
340 if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
341 /*
342 * Handle multi-node processors that
343 * have multiple chips, each with its
344 * own L3 cache, on the same die.
345 */
346 v = rdmsr(0xc001100c);
347 nodes_per_socket = 1 + ((v >> 3) & 0x7);
348 }
349 caches[2].id_shift =
350 pkg_id_shift - mask_width(nodes_per_socket);
351 caches[2].present = 1;
352 }
353 }
354 }
355 }
356
357 /*
358 * Determine topology of processing units for Intel CPUs
359 * using CPUID Leaf 1 and Leaf 4, if supported.
360 * See:
361 * - Intel 64 Architecture Processor Topology Enumeration
362 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
363 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
364 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
365 */
366 static void
topo_probe_intel_0x4(void)367 topo_probe_intel_0x4(void)
368 {
369 u_int p[4];
370 int max_cores;
371 int max_logical;
372
373 /* Both zero and one here mean one logical processor per package. */
374 max_logical = (cpu_feature & CPUID_HTT) != 0 ?
375 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
376 if (max_logical <= 1)
377 return;
378
379 if (cpu_high >= 0x4) {
380 cpuid_count(0x04, 0, p);
381 max_cores = ((p[0] >> 26) & 0x3f) + 1;
382 } else
383 max_cores = 1;
384
385 core_id_shift = mask_width(max_logical/max_cores);
386 KASSERT(core_id_shift >= 0,
387 ("intel topo: max_cores > max_logical\n"));
388 pkg_id_shift = core_id_shift + mask_width(max_cores);
389 }
390
391 /*
392 * Determine topology of processing units for Intel CPUs
393 * using CPUID Leaf 1Fh or 0Bh, if supported.
394 * See:
395 * - Intel 64 Architecture Processor Topology Enumeration
396 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
397 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
398 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
399 */
400 static void
topo_probe_intel_0xb(void)401 topo_probe_intel_0xb(void)
402 {
403 u_int leaf;
404 u_int p[4] = { 0 };
405 int bits;
406 int type;
407 int i;
408
409 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */
410 if (cpu_high >= 0x1f) {
411 leaf = 0x1f;
412 cpuid_count(leaf, 0, p);
413 }
414 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */
415 if (p[1] == 0) {
416 leaf = 0x0b;
417 cpuid_count(leaf, 0, p);
418 }
419 /* Fall back to leaf 04h (Deterministic Cache Parameters). */
420 if (p[1] == 0) {
421 topo_probe_intel_0x4();
422 return;
423 }
424
425 /* We only support three levels for now. */
426 for (i = 0; ; i++) {
427 cpuid_count(leaf, i, p);
428
429 bits = p[0] & 0x1f;
430 type = (p[2] >> 8) & 0xff;
431
432 if (type == 0)
433 break;
434
435 if (type == CPUID_TYPE_SMT)
436 core_id_shift = bits;
437 else if (type == CPUID_TYPE_CORE)
438 pkg_id_shift = bits;
439 else if (bootverbose)
440 printf("Topology level type %d shift: %d\n", type, bits);
441 }
442
443 if (pkg_id_shift < core_id_shift) {
444 printf("WARNING: core covers more APIC IDs than a package\n");
445 core_id_shift = pkg_id_shift;
446 }
447 }
448
449 /*
450 * Determine topology of caches for Intel CPUs.
451 * See:
452 * - Intel 64 Architecture Processor Topology Enumeration
453 * - Intel 64 and IA-32 Architectures Software Developer’s Manual
454 * Volume 2A: Instruction Set Reference, A-M,
455 * CPUID instruction
456 */
457 static void
topo_probe_intel_caches(void)458 topo_probe_intel_caches(void)
459 {
460 u_int p[4];
461 int level;
462 int share_count;
463 int type;
464 int i;
465
466 if (cpu_high < 0x4) {
467 /*
468 * Available cache level and sizes can be determined
469 * via CPUID leaf 2, but that requires a huge table of hardcoded
470 * values, so for now just assume L1 and L2 caches potentially
471 * shared only by HTT processing units, if HTT is present.
472 */
473 caches[0].id_shift = pkg_id_shift;
474 caches[0].present = 1;
475 caches[1].id_shift = pkg_id_shift;
476 caches[1].present = 1;
477 return;
478 }
479
480 for (i = 0; ; i++) {
481 cpuid_count(0x4, i, p);
482 type = p[0] & 0x1f;
483 level = (p[0] >> 5) & 0x7;
484 share_count = 1 + ((p[0] >> 14) & 0xfff);
485
486 if (!add_deterministic_cache(type, level, share_count))
487 break;
488 }
489 }
490
491 /*
492 * Determine topology of processing units and caches for Intel CPUs.
493 * See:
494 * - Intel 64 Architecture Processor Topology Enumeration
495 */
496 static void
topo_probe_intel(void)497 topo_probe_intel(void)
498 {
499
500 /*
501 * Note that 0x1 <= cpu_high < 4 case should be
502 * compatible with topo_probe_intel_0x4() logic when
503 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
504 * or it should trigger the fallback otherwise.
505 */
506 if (cpu_high >= 0xb)
507 topo_probe_intel_0xb();
508 else if (cpu_high >= 0x1)
509 topo_probe_intel_0x4();
510
511 topo_probe_intel_caches();
512 }
513
/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 *
 * The work is done once; subsequent calls return immediately.  The
 * function builds an ordered list of topology layers (outermost first),
 * then inserts every present APIC ID into the tree rooted at topo_root,
 * one node per layer.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	/*
	 * One entry per topology layer, outermost first.  The array is
	 * sized for the worst case: package, group, every cache level,
	 * core, processing unit and the optional NUMA node layer.
	 */
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 5];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
	int d, domain;
#endif

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	/* Vendor-specific discovery of the ID shifts and cache layout. */
	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	/* The package layer is always present. */
	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	/* Add a group layer only when a node is a strict subset of a package. */
	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 * Levels are visited from the highest index down, so the
	 * last-level cache becomes the outermost cache layer.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	/* A core layer is only useful when a package holds several cores. */
	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/* The innermost layer: one processing unit per APIC ID. */
	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

#if defined(DEV_ACPI) && MAXMEMDOM > 1
	if (vm_ndomains > 1) {
		/*
		 * Find the outermost layer whose every instance lies within
		 * a single memory domain.  For each candidate layer the APIC
		 * IDs are scanned in order; "domain" is reset whenever a new
		 * instance of the layer starts (low id_shift bits are zero)
		 * and the scan aborts as soon as two present CPUs of the same
		 * instance report different domains.
		 */
		for (layer = 0; layer < nlayers; ++layer) {
			for (i = 0; i <= max_apic_id; ++i) {
				if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
					domain = -1;
				if (!cpu_info[i].cpu_present)
					continue;
				d = acpi_pxm_get_cpu_locality(i);
				if (domain >= 0 && domain != d)
					break;
				domain = d;
			}
			/* The scan completed: this layer fits inside a domain. */
			if (i > max_apic_id)
				break;
		}
		KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
		/* Open a slot at "layer" and place the NUMA node layer there. */
		memmove(&topo_layers[layer+1], &topo_layers[layer],
		    sizeof(*topo_layers) * (nlayers - layer));
		topo_layers[layer].type = TOPO_TYPE_NODE;
		topo_layers[layer].subtype = CG_SHARE_NONE;
		nlayers++;
	}
#endif

	/*
	 * Build the tree: for every present APIC ID descend through the
	 * layers, adding (or reusing, per topo_add_node_by_hwid()) the node
	 * identified by the APIC ID shifted by the layer's id_shift.  NUMA
	 * node layers are keyed by the ACPI locality instead.
	 */
	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
			if (topo_layers[layer].type == TOPO_TYPE_NODE) {
				node_id = acpi_pxm_get_cpu_locality(i);
			} else
#endif
				node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	/*
	 * Walk the BSP's path from the root to its processing unit and
	 * promote each node on the way.
	 * NOTE(review): topo_promote_child() presumably moves the node to
	 * the front of its parent's child list so that the BSP is visited
	 * first by later traversals -- confirm against its definition.
	 */
	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
		if (topo_layers[layer].type == TOPO_TYPE_NODE)
			node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
		else
#endif
			node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}
675
676 /*
677 * Assign logical CPU IDs to local APICs.
678 */
679 void
assign_cpu_ids(void)680 assign_cpu_ids(void)
681 {
682 struct topo_node *node;
683 u_int smt_mask;
684 int nhyper;
685
686 smt_mask = (1u << core_id_shift) - 1;
687
688 /*
689 * Assign CPU IDs to local APIC IDs and disable any CPUs
690 * beyond MAXCPU. CPU 0 is always assigned to the BSP.
691 */
692 mp_ncpus = 0;
693 nhyper = 0;
694 TOPO_FOREACH(node, &topo_root) {
695 if (node->type != TOPO_TYPE_PU)
696 continue;
697
698 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
699 cpu_info[node->hwid].cpu_hyperthread = 1;
700
701 if (resource_disabled("lapic", node->hwid)) {
702 if (node->hwid != boot_cpu_id)
703 cpu_info[node->hwid].cpu_disabled = 1;
704 else
705 printf("Cannot disable BSP, APIC ID = %d\n",
706 node->hwid);
707 }
708
709 if (!hyperthreading_allowed &&
710 cpu_info[node->hwid].cpu_hyperthread)
711 cpu_info[node->hwid].cpu_disabled = 1;
712
713 if (mp_ncpus >= MAXCPU)
714 cpu_info[node->hwid].cpu_disabled = 1;
715
716 if (cpu_info[node->hwid].cpu_disabled) {
717 disabled_cpus++;
718 continue;
719 }
720
721 if (cpu_info[node->hwid].cpu_hyperthread)
722 nhyper++;
723
724 cpu_apic_ids[mp_ncpus] = node->hwid;
725 apic_cpuids[node->hwid] = mp_ncpus;
726 topo_set_pu_id(node, mp_ncpus);
727 mp_ncpus++;
728 }
729
730 KASSERT(mp_maxid >= mp_ncpus - 1,
731 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
732 mp_ncpus));
733
734 mp_ncores = mp_ncpus - nhyper;
735 smp_threads_per_core = mp_ncpus / mp_ncores;
736 }
737
738 /*
739 * Print various information about the SMP system hardware and setup.
740 */
741 void
cpu_mp_announce(void)742 cpu_mp_announce(void)
743 {
744 struct topo_node *node;
745 const char *hyperthread;
746 struct topo_analysis topology;
747
748 printf("FreeBSD/SMP: ");
749 if (topo_analyze(&topo_root, 1, &topology)) {
750 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
751 if (topology.entities[TOPO_LEVEL_GROUP] > 1)
752 printf(" x %d groups",
753 topology.entities[TOPO_LEVEL_GROUP]);
754 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
755 printf(" x %d cache groups",
756 topology.entities[TOPO_LEVEL_CACHEGROUP]);
757 if (topology.entities[TOPO_LEVEL_CORE] > 0)
758 printf(" x %d core(s)",
759 topology.entities[TOPO_LEVEL_CORE]);
760 if (topology.entities[TOPO_LEVEL_THREAD] > 1)
761 printf(" x %d hardware threads",
762 topology.entities[TOPO_LEVEL_THREAD]);
763 } else {
764 printf("Non-uniform topology");
765 }
766 printf("\n");
767
768 if (disabled_cpus) {
769 printf("FreeBSD/SMP Online: ");
770 if (topo_analyze(&topo_root, 0, &topology)) {
771 printf("%d package(s)",
772 topology.entities[TOPO_LEVEL_PKG]);
773 if (topology.entities[TOPO_LEVEL_GROUP] > 1)
774 printf(" x %d groups",
775 topology.entities[TOPO_LEVEL_GROUP]);
776 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
777 printf(" x %d cache groups",
778 topology.entities[TOPO_LEVEL_CACHEGROUP]);
779 if (topology.entities[TOPO_LEVEL_CORE] > 0)
780 printf(" x %d core(s)",
781 topology.entities[TOPO_LEVEL_CORE]);
782 if (topology.entities[TOPO_LEVEL_THREAD] > 1)
783 printf(" x %d hardware threads",
784 topology.entities[TOPO_LEVEL_THREAD]);
785 } else {
786 printf("Non-uniform topology");
787 }
788 printf("\n");
789 }
790
791 if (!bootverbose)
792 return;
793
794 TOPO_FOREACH(node, &topo_root) {
795 switch (node->type) {
796 case TOPO_TYPE_PKG:
797 printf("Package HW ID = %u\n", node->hwid);
798 break;
799 case TOPO_TYPE_CORE:
800 printf("\tCore HW ID = %u\n", node->hwid);
801 break;
802 case TOPO_TYPE_PU:
803 if (cpu_info[node->hwid].cpu_hyperthread)
804 hyperthread = "/HT";
805 else
806 hyperthread = "";
807
808 if (node->subtype == 0)
809 printf("\t\tCPU (AP%s): APIC ID: %u"
810 "(disabled)\n", hyperthread, node->hwid);
811 else if (node->id == 0)
812 printf("\t\tCPU0 (BSP): APIC ID: %u\n",
813 node->hwid);
814 else
815 printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
816 node->id, hyperthread, node->hwid);
817 break;
818 default:
819 /* ignored */
820 break;
821 }
822 }
823 }
824
/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 *
 * root is the topology node to convert (system, cache, NUMA node or group)
 * and cg_root is the already allocated cpu_group that receives its
 * description.  Child groups are allocated with smp_topo_alloc() and filled
 * in by recursion.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	/* Copy the CPU set and derive the sharing level and flags. */
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_CACHE)
		cg_root->cg_level = root->subtype;
	else
		cg_root->cg_level = CG_SHARE_NONE;
	if (root->type == TOPO_TYPE_NODE)
		cg_root->cg_flags = CG_FLAG_NODE;
	else
		cg_root->cg_flags = 0;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		/* Count the core and skip whatever lies below it. */
		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags |= CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		/*
		 * When some APICs are disabled by tunables, nodes can end up
		 * with an empty cpuset. Nodes with an empty cpuset will be
		 * translated into cpu groups with empty cpusets. smp_topo_fill
		 * will then set cg_first and cg_last to -1. This isn't
		 * correctly handled in all functions. E.g. when
		 * cpu_search_lowest and cpu_search_highest loop through all
		 * cpus, they call CPU_ISSET on cpu -1 which ends up in a
		 * general protection fault.
		 *
		 * We could fix the scheduler to handle empty cpu groups
		 * correctly. Nevertheless, empty cpu groups are causing
		 * overhead for no value. So, it makes more sense to just don't
		 * create them.
		 */
		if (CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		/*
		 * A node covering exactly the root's CPUs (including the root
		 * itself) does not become a child; it only contributes its
		 * cache level and NUMA flag to the root group.
		 */
		if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
			if (node->type == TOPO_TYPE_CACHE &&
			    cg_root->cg_level < node->subtype)
				cg_root->cg_level = node->subtype;
			if (node->type == TOPO_TYPE_NODE)
				cg_root->cg_flags |= CG_FLAG_NODE;
			node = topo_next_node(root, node);
			continue;
		}
		/* Only group, NUMA node and cache nodes become child groups. */
		if (node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	/*
	 * We are not interested in nodes including only one CPU each.
	 */
	if (nchildren == root->cpu_count)
		return;

	/*
	 * We are not interested in nodes without children.
	 */
	cg_root->cg_children = nchildren;
	if (nchildren == 0)
		return;

	cg_root->cg_child = smp_topo_alloc(nchildren);

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 * The filter below must select exactly the nodes counted above,
	 * otherwise i would not stay within the nchildren entries allocated.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) ||
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0 ||
		    CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}
958
959 /*
960 * Build the MI scheduling topology from the discovered hardware topology.
961 */
962 struct cpu_group *
cpu_topo(void)963 cpu_topo(void)
964 {
965 struct cpu_group *cg_root;
966
967 if (mp_ncpus <= 1)
968 return (smp_topo_none());
969
970 cg_root = smp_topo_alloc(1);
971 x86topo_add_sched_group(&topo_root, cg_root);
972 return (cg_root);
973 }
974
975 static void
cpu_alloc(void * dummy __unused)976 cpu_alloc(void *dummy __unused)
977 {
978 /*
979 * Dynamically allocate the arrays that depend on the
980 * maximum APIC ID.
981 */
982 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
983 M_WAITOK | M_ZERO);
984 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
985 M_WAITOK | M_ZERO);
986 }
987 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
988
989 /*
990 * Add a logical CPU to the topology.
991 */
992 void
cpu_add(u_int apic_id,char boot_cpu)993 cpu_add(u_int apic_id, char boot_cpu)
994 {
995
996 if (apic_id > max_apic_id)
997 panic("SMP: APIC ID %d too high", apic_id);
998
999 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
1000 apic_id));
1001 cpu_info[apic_id].cpu_present = 1;
1002 if (boot_cpu) {
1003 KASSERT(boot_cpu_id == -1,
1004 ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
1005 boot_cpu_id));
1006 boot_cpu_id = apic_id;
1007 cpu_info[apic_id].cpu_bsp = 1;
1008 }
1009 if (bootverbose)
1010 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
1011 "AP");
1012 }
1013
1014 void
cpu_mp_setmaxid(void)1015 cpu_mp_setmaxid(void)
1016 {
1017
1018 /*
1019 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
1020 * If there were no calls to cpu_add() assume this is a UP system.
1021 */
1022 if (mp_ncpus == 0)
1023 mp_ncpus = 1;
1024 }
1025
1026 int
cpu_mp_probe(void)1027 cpu_mp_probe(void)
1028 {
1029
1030 /*
1031 * Always record BSP in CPU map so that the mbuf init code works
1032 * correctly.
1033 */
1034 CPU_SETOF(0, &all_cpus);
1035 return (mp_ncpus > 1);
1036 }
1037
/*
 * AP CPU's call this to initialize themselves.
 *
 * Brings the calling AP from early bootstrap to the point where it enters
 * the scheduler: per-CPU hardware state first, then the bookkeeping done
 * under ap_boot_mtx, then a spin until every AP has checked in.  The
 * function is not expected to return; it panics if sched_ap_entry() does.
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	/* Optional cpu_ops hook for additional per-CPU initialization. */
	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/*
	 * A quick check from sanity claus: the APIC ID recorded in this
	 * CPU's per-CPU data must match what the local APIC reports.
	 */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));
	schedinit_ap();

	/*
	 * Everything up to the matching unlock runs with ap_boot_mtx held,
	 * so the smp_cpus accounting and the launch messages below are
	 * executed by one AP at a time.
	 */
	mtx_lock_spin(&ap_boot_mtx);

	mca_init();

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	/*
	 * Verbose boots get one line per AP; otherwise the CPU IDs are
	 * printed on a single line that is opened when smp_cpus reaches 2
	 * and closed by the AP that brings smp_cpus up to mp_ncpus.
	 */
	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/* Determine if we are a logical CPU. */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	/* The last AP to arrive flips smp_started and releases the others. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	/* Enable PCID if the pmap uses it and load the user segment selectors. */
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

	kcsan_cpu_init(cpuid);

	sched_ap_entry();

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
1141
1142 static void
smp_after_idle_runnable(void * arg __unused)1143 smp_after_idle_runnable(void *arg __unused)
1144 {
1145 int cpu;
1146
1147 if (mp_ncpus == 1)
1148 return;
1149
1150 KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
1151
1152 /*
1153 * Wait for all APs to handle an interrupt. After that, we know that
1154 * the APs have entered the scheduler at least once, so the boot stacks
1155 * are safe to free.
1156 */
1157 smp_rendezvous(smp_no_rendezvous_barrier, NULL,
1158 smp_no_rendezvous_barrier, NULL);
1159
1160 for (cpu = 1; cpu < mp_ncpus; cpu++) {
1161 kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE);
1162 }
1163 }
1164 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
1165 smp_after_idle_runnable, NULL);
1166
1167 /*
1168 * We tell the I/O APIC code about all the CPUs we want to receive
1169 * interrupts. If we don't want certain CPUs to receive IRQs we
1170 * can simply not tell the I/O APIC code about them in this function.
1171 * We also do not tell it about the BSP since it tells itself about
1172 * the BSP internally to work with UP kernels and on UP machines.
1173 */
1174 void
set_interrupt_apic_ids(void)1175 set_interrupt_apic_ids(void)
1176 {
1177 u_int i, apic_id;
1178
1179 for (i = 0; i < MAXCPU; i++) {
1180 apic_id = cpu_apic_ids[i];
1181 if (apic_id == -1)
1182 continue;
1183 if (cpu_info[apic_id].cpu_bsp)
1184 continue;
1185 if (cpu_info[apic_id].cpu_disabled)
1186 continue;
1187 if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit)
1188 continue;
1189
1190 /* Don't let hyperthreads service interrupts. */
1191 if (cpu_info[apic_id].cpu_hyperthread &&
1192 !hyperthreading_intr_allowed)
1193 continue;
1194
1195 intr_add_cpu(i);
1196 }
1197 }
1198
#ifdef COUNT_XINVLTLB_HITS
/*
 * Debug counters that exist only in COUNT_XINVLTLB_HITS kernels.  All of
 * them are exported read/write under the debug.xhits sysctl node.
 *
 * The xhits_* arrays hold one u_int per possible CPU (MAXCPU) and are
 * exported whole as opaque data; the ipi_* values are single counters.
 * NOTE(review): nothing in this part of the file updates them; presumably
 * the TLB invalidation IPI paths do -- confirm against the users.
 */
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW,
    &xhits_gbl, sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW,
    &xhits_pg, sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW,
    &xhits_rng, sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW,
    &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW,
    &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW,
    &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
    &ipi_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */
1222
1223 /*
1224 * Init and startup IPI.
1225 */
1226 void
ipi_startup(int apic_id,int vector)1227 ipi_startup(int apic_id, int vector)
1228 {
1229
1230 /*
1231 * This attempts to follow the algorithm described in the
1232 * Intel Multiprocessor Specification v1.4 in section B.4.
1233 * For each IPI, we allow the local APIC ~20us to deliver the
1234 * IPI. If that times out, we panic.
1235 */
1236
1237 /*
1238 * first we do an INIT IPI: this INIT IPI might be run, resetting
1239 * and running the target CPU. OR this INIT IPI might be latched (P5
1240 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
1241 * ignored.
1242 */
1243 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1244 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1245 lapic_ipi_wait(100);
1246
1247 /* Explicitly deassert the INIT IPI. */
1248 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1249 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
1250 apic_id);
1251
1252 DELAY(10000); /* wait ~10mS */
1253
1254 /*
1255 * next we do a STARTUP IPI: the previous INIT IPI might still be
1256 * latched, (P5 bug) this 1st STARTUP would then terminate
1257 * immediately, and the previously started INIT IPI would continue. OR
1258 * the previous INIT IPI has already run. and this STARTUP IPI will
1259 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
1260 * will run.
1261 */
1262 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1263 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1264 vector, apic_id);
1265 if (!lapic_ipi_wait(100))
1266 panic("Failed to deliver first STARTUP IPI to APIC %d",
1267 apic_id);
1268 DELAY(200); /* wait ~200uS */
1269
1270 /*
1271 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1272 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
1273 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1274 * recognized after hardware RESET or INIT IPI.
1275 */
1276 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1277 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1278 vector, apic_id);
1279 if (!lapic_ipi_wait(100))
1280 panic("Failed to deliver second STARTUP IPI to APIC %d",
1281 apic_id);
1282
1283 DELAY(200); /* wait ~200uS */
1284 }
1285
1286 static bool
ipi_bitmap_set(int cpu,u_int ipi)1287 ipi_bitmap_set(int cpu, u_int ipi)
1288 {
1289 u_int bitmap, old, new;
1290 u_int *cpu_bitmap;
1291
1292 bitmap = 1 << ipi;
1293 cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
1294 old = *cpu_bitmap;
1295 for (;;) {
1296 if ((old & bitmap) != 0)
1297 break;
1298 new = old | bitmap;
1299 if (atomic_fcmpset_int(cpu_bitmap, &old, new))
1300 break;
1301 }
1302 return (old != 0);
1303 }
1304
1305 /*
1306 * Send an IPI to specified CPU handling the bitmap logic.
1307 */
1308 static void
ipi_send_cpu(int cpu,u_int ipi)1309 ipi_send_cpu(int cpu, u_int ipi)
1310 {
1311
1312 KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
1313 ("IPI to non-existent CPU %d", cpu));
1314
1315 if (IPI_IS_BITMAPED(ipi)) {
1316 if (ipi_bitmap_set(cpu, ipi))
1317 return;
1318 ipi = IPI_BITMAP_VECTOR;
1319 }
1320 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1321 }
1322
1323 void
ipi_bitmap_handler(struct trapframe frame)1324 ipi_bitmap_handler(struct trapframe frame)
1325 {
1326 struct trapframe *oldframe;
1327 struct thread *td;
1328 int cpu = PCPU_GET(cpuid);
1329 u_int ipi_bitmap;
1330
1331 kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);
1332
1333 td = curthread;
1334 ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
1335 pc_ipi_bitmap);
1336
1337 /*
1338 * sched_preempt() must be called to clear the pending preempt
1339 * IPI to enable delivery of further preempts. However, the
1340 * critical section will cause extra scheduler lock thrashing
1341 * when used unconditionally. Only critical_enter() if
1342 * hardclock must also run, which requires the section entry.
1343 */
1344 if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1345 critical_enter();
1346
1347 td->td_intr_nesting_level++;
1348 oldframe = td->td_intr_frame;
1349 td->td_intr_frame = &frame;
1350 #if defined(STACK) || defined(DDB)
1351 if (ipi_bitmap & (1 << IPI_TRACE))
1352 stack_capture_intr();
1353 #endif
1354 if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1355 #ifdef COUNT_IPIS
1356 (*ipi_preempt_counts[cpu])++;
1357 #endif
1358 sched_preempt(td);
1359 }
1360 if (ipi_bitmap & (1 << IPI_AST)) {
1361 #ifdef COUNT_IPIS
1362 (*ipi_ast_counts[cpu])++;
1363 #endif
1364 /* Nothing to do for AST */
1365 }
1366 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1367 #ifdef COUNT_IPIS
1368 (*ipi_hardclock_counts[cpu])++;
1369 #endif
1370 hardclockintr();
1371 }
1372 td->td_intr_frame = oldframe;
1373 td->td_intr_nesting_level--;
1374 if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1375 critical_exit();
1376 }
1377
1378 /*
1379 * send an IPI to a set of cpus.
1380 */
1381 void
ipi_selected(cpuset_t cpus,u_int ipi)1382 ipi_selected(cpuset_t cpus, u_int ipi)
1383 {
1384 int cpu;
1385
1386 /*
1387 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1388 * of help in order to understand what is the source.
1389 * Set the mask of receiving CPUs for this purpose.
1390 */
1391 if (ipi == IPI_STOP_HARD)
1392 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1393
1394 CPU_FOREACH_ISSET(cpu, &cpus) {
1395 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1396 ipi_send_cpu(cpu, ipi);
1397 }
1398 }
1399
1400 /*
1401 * send an IPI to a specific CPU.
1402 */
1403 void
ipi_cpu(int cpu,u_int ipi)1404 ipi_cpu(int cpu, u_int ipi)
1405 {
1406
1407 /*
1408 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1409 * of help in order to understand what is the source.
1410 * Set the mask of receiving CPUs for this purpose.
1411 */
1412 if (ipi == IPI_STOP_HARD)
1413 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1414
1415 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1416 ipi_send_cpu(cpu, ipi);
1417 }
1418
1419 /*
1420 * send an IPI to all CPUs EXCEPT myself
1421 */
1422 void
ipi_all_but_self(u_int ipi)1423 ipi_all_but_self(u_int ipi)
1424 {
1425 cpuset_t other_cpus;
1426 int cpu, c;
1427
1428 /*
1429 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1430 * of help in order to understand what is the source.
1431 * Set the mask of receiving CPUs for this purpose.
1432 */
1433 if (ipi == IPI_STOP_HARD) {
1434 other_cpus = all_cpus;
1435 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1436 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1437 }
1438
1439 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1440 if (IPI_IS_BITMAPED(ipi)) {
1441 cpu = PCPU_GET(cpuid);
1442 CPU_FOREACH(c) {
1443 if (c != cpu)
1444 ipi_bitmap_set(c, ipi);
1445 }
1446 ipi = IPI_BITMAP_VECTOR;
1447 }
1448 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1449 }
1450
1451 void
ipi_self_from_nmi(u_int vector)1452 ipi_self_from_nmi(u_int vector)
1453 {
1454
1455 lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);
1456
1457 /* Wait for IPI to finish. */
1458 if (!lapic_ipi_wait(50000)) {
1459 if (KERNEL_PANICKED())
1460 return;
1461 else
1462 panic("APIC: IPI is stuck");
1463 }
1464 }
1465
1466 int
ipi_nmi_handler(void)1467 ipi_nmi_handler(void)
1468 {
1469 u_int cpuid;
1470
1471 /*
1472 * As long as there is not a simple way to know about a NMI's
1473 * source, if the bitmask for the current CPU is present in
1474 * the global pending bitword an IPI_STOP_HARD has been issued
1475 * and should be handled.
1476 */
1477 cpuid = PCPU_GET(cpuid);
1478 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1479 return (1);
1480
1481 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1482 cpustop_handler();
1483 return (0);
1484 }
1485
1486 int nmi_kdb_lock;
1487
1488 void
nmi_call_kdb_smp(u_int type,struct trapframe * frame)1489 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
1490 {
1491 int cpu;
1492 bool call_post;
1493
1494 cpu = PCPU_GET(cpuid);
1495 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
1496 nmi_call_kdb(cpu, type, frame);
1497 call_post = false;
1498 } else {
1499 savectx(&stoppcbs[cpu]);
1500 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1501 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
1502 ia32_pause();
1503 call_post = true;
1504 }
1505 atomic_store_rel_int(&nmi_kdb_lock, 0);
1506 if (call_post)
1507 cpustop_handler_post(cpu);
1508 }
1509
1510 /*
1511 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
1512 * if available) until we are resumed.
1513 */
1514 void
cpustop_handler(void)1515 cpustop_handler(void)
1516 {
1517 struct monitorbuf *mb;
1518 u_int cpu;
1519 bool use_mwait;
1520
1521 cpu = PCPU_GET(cpuid);
1522
1523 savectx(&stoppcbs[cpu]);
1524
1525 use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
1526 !mwait_cpustop_broken);
1527 if (use_mwait) {
1528 mb = PCPU_PTR(monitorbuf);
1529 atomic_store_int(&mb->stop_state,
1530 MONITOR_STOPSTATE_STOPPED);
1531 }
1532
1533 /* Indicate that we are stopped */
1534 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1535
1536 /* Wait for restart */
1537 while (!CPU_ISSET(cpu, &started_cpus)) {
1538 if (use_mwait) {
1539 cpu_monitor(mb, 0, 0);
1540 if (atomic_load_int(&mb->stop_state) ==
1541 MONITOR_STOPSTATE_STOPPED)
1542 cpu_mwait(0, MWAIT_C1);
1543 continue;
1544 }
1545
1546 ia32_pause();
1547
1548 /*
1549 * Halt non-BSP CPUs on panic -- we're never going to need them
1550 * again, and might as well save power / release resources
1551 * (e.g., overprovisioned VM infrastructure).
1552 */
1553 while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
1554 halt();
1555 }
1556
1557 cpustop_handler_post(cpu);
1558 }
1559
1560 static void
cpustop_handler_post(u_int cpu)1561 cpustop_handler_post(u_int cpu)
1562 {
1563
1564 CPU_CLR_ATOMIC(cpu, &started_cpus);
1565 CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1566
1567 /*
1568 * We don't broadcast TLB invalidations to other CPUs when they are
1569 * stopped. Hence, we clear the TLB before resuming.
1570 */
1571 invltlb_glob();
1572
1573 #if defined(__amd64__) && (defined(DDB) || defined(GDB))
1574 amd64_db_resume_dbreg();
1575 #endif
1576
1577 if (cpu == 0 && cpustop_restartfunc != NULL) {
1578 cpustop_restartfunc();
1579 cpustop_restartfunc = NULL;
1580 }
1581 }
1582
1583 /*
1584 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1585 * are resumed.
1586 */
1587 void
cpususpend_handler(void)1588 cpususpend_handler(void)
1589 {
1590 u_int cpu;
1591
1592 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
1593
1594 cpu = PCPU_GET(cpuid);
1595
1596 #ifdef XENHVM
1597 /*
1598 * Some Xen guest types (PVH) expose a very minimal set of ACPI tables,
1599 * and for example have no support for SCI. That leads to the suspend
1600 * stacks not being allocated, and hence when attempting to perform a
1601 * Xen triggered suspension FreeBSD will hit a #PF. Avoid saving the
1602 * CPU and FPU contexts if the stacks are not allocated, as the
1603 * hypervisor will already take care of this. Note that we could even
1604 * do this for Xen triggered suspensions on guests that have full ACPI
1605 * support, but doing so would introduce extra complexity.
1606 */
1607 if (susppcbs == NULL) {
1608 KASSERT(vm_guest == VM_GUEST_XEN, ("Missing suspend stack"));
1609 CPU_SET_ATOMIC(cpu, &suspended_cpus);
1610 CPU_SET_ATOMIC(cpu, &resuming_cpus);
1611 } else
1612 #endif
1613 if (savectx(&susppcbs[cpu]->sp_pcb)) {
1614 #ifdef __amd64__
1615 fpususpend(susppcbs[cpu]->sp_fpususpend);
1616 #else
1617 npxsuspend(susppcbs[cpu]->sp_fpususpend);
1618 #endif
1619 /*
1620 * suspended_cpus is cleared shortly after each AP is restarted
1621 * by a Startup IPI, so that the BSP can proceed to restarting
1622 * the next AP.
1623 *
1624 * resuming_cpus gets cleared when the AP completes
1625 * initialization after having been released by the BSP.
1626 * resuming_cpus is probably not the best name for the
1627 * variable, because it is actually a set of processors that
1628 * haven't resumed yet and haven't necessarily started resuming.
1629 *
1630 * Note that suspended_cpus is meaningful only for ACPI suspend
1631 * as it's not really used for Xen suspend since the APs are
1632 * automatically restored to the running state and the correct
1633 * context. For the same reason resumectx is never called in
1634 * that case.
1635 */
1636 CPU_SET_ATOMIC(cpu, &suspended_cpus);
1637 CPU_SET_ATOMIC(cpu, &resuming_cpus);
1638
1639 /*
1640 * Invalidate the cache after setting the global status bits.
1641 * The last AP to set its bit may end up being an Owner of the
1642 * corresponding cache line in MOESI protocol. The AP may be
1643 * stopped before the cache line is written to the main memory.
1644 */
1645 wbinvd();
1646 } else {
1647 #ifdef __amd64__
1648 fpuresume(susppcbs[cpu]->sp_fpususpend);
1649 #else
1650 npxresume(susppcbs[cpu]->sp_fpususpend);
1651 #endif
1652 pmap_init_pat();
1653 initializecpu();
1654 PCPU_SET(switchtime, 0);
1655 PCPU_SET(switchticks, ticks);
1656
1657 /* Indicate that we have restarted and restored the context. */
1658 CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1659 }
1660
1661 /* Wait for resume directive */
1662 while (!CPU_ISSET(cpu, &toresume_cpus))
1663 ia32_pause();
1664
1665 /* Re-apply microcode updates. */
1666 ucode_reload();
1667
1668 #ifdef __i386__
1669 /* Finish removing the identity mapping of low memory for this AP. */
1670 invltlb_glob();
1671 #endif
1672
1673 if (cpu_ops.cpu_resume)
1674 cpu_ops.cpu_resume();
1675 #ifdef __amd64__
1676 if (vmm_resume_p)
1677 vmm_resume_p();
1678 #endif
1679
1680 /* Resume MCA and local APIC */
1681 lapic_xapic_mode();
1682 mca_resume();
1683 lapic_setup(0);
1684
1685 /* Indicate that we are resumed */
1686 CPU_CLR_ATOMIC(cpu, &resuming_cpus);
1687 CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1688 CPU_CLR_ATOMIC(cpu, &toresume_cpus);
1689 }
1690
1691 /*
1692 * Handle an IPI_SWI by waking delayed SWI thread.
1693 */
1694 void
ipi_swi_handler(struct trapframe frame)1695 ipi_swi_handler(struct trapframe frame)
1696 {
1697
1698 intr_event_handle(clk_intr_event, &frame);
1699 }
1700
1701 /*
1702 * This is called once the rest of the system is up and running and we're
1703 * ready to let the AP's out of the pen.
1704 */
1705 static void
release_aps(void * dummy __unused)1706 release_aps(void *dummy __unused)
1707 {
1708
1709 if (mp_ncpus == 1)
1710 return;
1711 atomic_store_rel_int(&aps_ready, 1);
1712 while (smp_started == 0)
1713 ia32_pause();
1714 }
1715 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1716
#ifdef COUNT_IPIS
/*
 * Register one interrupt counter per CPU for each IPI handler type.
 *
 * Counters are named "cpu<N>:<type>" and are added, for every CPU, in
 * the order: invltlb, invlrng, invlpg, invlcache, preempt, ast,
 * rendezvous, hardclock.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char name[64];
	int cpu;

/* Format the counter name for 'cpu' and register its counts[] slot. */
#define	IPI_INTRCNT_ADD(fmt, counts) do {				\
	snprintf(name, sizeof(name), fmt, cpu);				\
	intrcnt_add(name, &(counts)[cpu]);				\
} while (0)

	CPU_FOREACH(cpu) {
		IPI_INTRCNT_ADD("cpu%d:invltlb", ipi_invltlb_counts);
		IPI_INTRCNT_ADD("cpu%d:invlrng", ipi_invlrng_counts);
		IPI_INTRCNT_ADD("cpu%d:invlpg", ipi_invlpg_counts);
		IPI_INTRCNT_ADD("cpu%d:invlcache", ipi_invlcache_counts);
		IPI_INTRCNT_ADD("cpu%d:preempt", ipi_preempt_counts);
		IPI_INTRCNT_ADD("cpu%d:ast", ipi_ast_counts);
		IPI_INTRCNT_ADD("cpu%d:rendezvous", ipi_rendezvous_counts);
		IPI_INTRCNT_ADD("cpu%d:hardclock", ipi_hardclock_counts);
	}

#undef IPI_INTRCNT_ADD
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif
1748