1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * CPU/APIC topology
4 *
5 * The APIC IDs describe the system topology in multiple domain levels.
6 * The CPUID topology parser provides the information which part of the
7 * APIC ID is associated to the individual levels:
8 *
9 * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
10 *
11 * The root space contains the package (socket) IDs.
12 *
13 * Not enumerated levels consume 0 bits space, but conceptually they are
14 * always represented. If e.g. only CORE and THREAD levels are enumerated
15 * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
16 *
17 * If SMT is not supported, then the THREAD domain is still used. It then
18 * has the same physical ID as the CORE domain and is the only child of
19 * the core domain.
20 *
21 * This allows a unified view on the system independent of the enumerated
22 * domain levels without requiring any conditionals in the code.
23 */
24 #define pr_fmt(fmt) "CPU topo: " fmt
25 #include <linux/cpu.h>
26
27 #include <xen/xen.h>
28
29 #include <asm/apic.h>
30 #include <asm/io_apic.h>
31 #include <asm/mpspec.h>
32 #include <asm/msr.h>
33 #include <asm/smp.h>
34 #include <asm/numa.h>
35
36 #include "cpu.h"
37
/*
 * Map cpu index to physical APIC ID and to the ACPI ID. Both per CPU
 * variables are preset with their respective invalid marker.
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);

/* Bitmap of physically present CPUs. Indexed by APIC ID. */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;

/*
 * Used for CPU number allocation and parallel CPU bringup. Indexed by
 * CPU number, every slot is preset to BAD_APICID.
 */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };

/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;

/*
 * Keep track of assigned, disabled and rejected CPUs. Preset assigned
 * with 1 as CPU #0 is reserved for the boot CPU.
 *
 * @boot_cpu_apic_id is set when the boot CPU APIC is registered and
 * @real_bsp_apic_id is set by the real BSP detection. Both are
 * BAD_APICID until then.
 */
static struct {
	unsigned int		nr_assigned_cpus;
	unsigned int		nr_disabled_cpus;
	unsigned int		nr_rejected_cpus;
	u32			boot_cpu_apic_id;
	u32			real_bsp_apic_id;
} topo_info __ro_after_init = {
	.nr_assigned_cpus	= 1,
	.boot_cpu_apic_id	= BAD_APICID,
	.real_bsp_apic_id	= BAD_APICID,
};

/* Number of APIC IDs which are registered in the map of domain @_dom */
#define domain_weight(_dom)	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
72
arch_match_cpu_phys_id(int cpu,u64 phys_id)73 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
74 {
75 return phys_id == (u64)cpuid_to_apicid[cpu];
76 }
77
cpu_mark_primary_thread(unsigned int cpu,unsigned int apicid)78 static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
79 {
80 if (!(apicid & (__max_threads_per_core - 1)))
81 cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
82 }
83
84 /*
85 * Convert the APIC ID to a domain level ID by masking out the low bits
86 * below the domain level @dom.
87 */
topo_apicid(u32 apicid,enum x86_topology_domains dom)88 static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
89 {
90 if (dom == TOPO_SMT_DOMAIN)
91 return apicid;
92 return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
93 }
94
/*
 * Search the assigned part of cpuid_to_apicid[] for @apic_id.
 *
 * Returns the CPU number on success or -ENODEV if @apic_id is not found.
 */
static int topo_lookup_cpuid(u32 apic_id)
{
	unsigned int cpu = 0;

	/* CPU# to APICID mapping is persistent once it is established */
	while (cpu < topo_info.nr_assigned_cpus) {
		if (cpuid_to_apicid[cpu] == apic_id)
			return cpu;
		cpu++;
	}
	return -ENODEV;
}
106
topo_get_cpunr(u32 apic_id)107 static __init int topo_get_cpunr(u32 apic_id)
108 {
109 int cpu = topo_lookup_cpuid(apic_id);
110
111 if (cpu >= 0)
112 return cpu;
113
114 return topo_info.nr_assigned_cpus++;
115 }
116
/*
 * Store @apic_id and @acpi_id in the per CPU variables of @cpu on SMP or
 * 64-bit builds and mark @cpu present.
 */
static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_X86_64) || defined(CONFIG_SMP)
	early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
	early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
#endif
	set_cpu_present(cpu, true);
}
125
/*
 * Evaluate the first firmware enumerated APIC ID to figure out which APIC
 * belongs to the real BSP.
 *
 * Returns true if @apic_id belongs to the real BSP while the kernel is
 * running on a different CPU, i.e. when @apic_id must not be registered.
 * Returns false in all other cases, including every invocation after the
 * real BSP APIC ID has been established.
 */
static __init bool check_for_real_bsp(u32 apic_id)
{
	bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
	u64 msr;

	/*
	 * There is no really good way to detect whether this is a kdump()
	 * kernel, but except on the Voyager SMP monstrosity, which is no
	 * longer supported, the real BSP APIC ID is the first one which is
	 * enumerated by firmware. That allows to detect whether the boot
	 * CPU is the real BSP. If it is not, then do not register the APIC
	 * because sending INIT to the real BSP would reset the whole
	 * system.
	 *
	 * The first APIC ID which is enumerated by firmware is detectable
	 * because the boot CPU APIC ID is registered before that without
	 * invoking this code.
	 */
	if (topo_info.real_bsp_apic_id != BAD_APICID)
		return false;

	/*
	 * Check whether the enumeration order is broken by evaluating the
	 * BSP bit in the APICBASE MSR. If the CPU does not have the
	 * APICBASE MSR then the BSP detection is not possible and the
	 * kernel must rely on the firmware enumeration order.
	 */
	if (has_apic_base) {
		rdmsrq(MSR_IA32_APICBASE, msr);
		is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
	}

	if (apic_id == topo_info.boot_cpu_apic_id) {
		/*
		 * If the boot CPU has the APIC BSP bit set then the
		 * firmware enumeration is agreeing. If the CPU does not
		 * have the APICBASE MSR then the only choice is to trust
		 * the enumeration order.
		 */
		if (is_bsp || !has_apic_base) {
			topo_info.real_bsp_apic_id = apic_id;
			return false;
		}
		/*
		 * If the boot APIC is enumerated first, but the APICBASE
		 * MSR does not have the BSP bit set, then there is no way
		 * to discover the real BSP here. Assume a crash kernel and
		 * limit the number of CPUs to 1 as an INIT to the real BSP
		 * would reset the machine.
		 *
		 * real_bsp_apic_id stays unset on this path.
		 */
		pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
		pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
		set_nr_cpu_ids(1);
		goto fwbug;
	}

	/* The first enumerated APIC ID is not the one of the boot CPU */
	pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
		topo_info.boot_cpu_apic_id, apic_id);

	if (is_bsp) {
		/*
		 * The boot CPU has the APIC BSP bit set. Use it and complain
		 * about the broken firmware enumeration.
		 */
		topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
		goto fwbug;
	}

	pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");

	/* @apic_id is the real BSP. Tell the caller to reject it. */
	topo_info.real_bsp_apic_id = apic_id;
	return true;

fwbug:
	pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
	return false;
}
203
/*
 * Count the bits which are set in @map within the ID range that is covered
 * by the @at_level unit starting at @lvlid.
 */
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
				    unsigned long *map)
{
	/* The range ends (exclusive) where the next unit of @at_level starts */
	unsigned int end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
	unsigned int id, cnt = 0;

	/* Unfortunately there is no bitmap_weight_range() */
	id = find_next_bit(map, end, lvlid);
	while (id < end) {
		cnt++;
		id = find_next_bit(map, end, id + 1);
	}
	return cnt;
}
217
topo_register_apic(u32 apic_id,u32 acpi_id,bool present)218 static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
219 {
220 int cpu, dom;
221
222 if (present) {
223 set_bit(apic_id, phys_cpu_present_map);
224
225 /*
226 * Double registration is valid in case of the boot CPU
227 * APIC because that is registered before the enumeration
228 * of the APICs via firmware parsers or VM guest
229 * mechanisms.
230 */
231 if (apic_id == topo_info.boot_cpu_apic_id)
232 cpu = 0;
233 else
234 cpu = topo_get_cpunr(apic_id);
235
236 cpuid_to_apicid[cpu] = apic_id;
237 topo_set_cpuids(cpu, apic_id, acpi_id);
238 } else {
239 topo_info.nr_disabled_cpus++;
240 }
241
242 /*
243 * Register present and possible CPUs in the domain
244 * maps. cpu_possible_map will be updated in
245 * topology_init_possible_cpus() after enumeration is done.
246 */
247 for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
248 set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
249 }
250
251 /**
252 * topology_register_apic - Register an APIC in early topology maps
253 * @apic_id: The APIC ID to set up
254 * @acpi_id: The ACPI ID associated to the APIC
255 * @present: True if the corresponding CPU is present
256 */
topology_register_apic(u32 apic_id,u32 acpi_id,bool present)257 void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
258 {
259 if (apic_id >= MAX_LOCAL_APIC) {
260 pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
261 topo_info.nr_rejected_cpus++;
262 return;
263 }
264
265 if (check_for_real_bsp(apic_id)) {
266 topo_info.nr_rejected_cpus++;
267 return;
268 }
269
270 /* CPU numbers exhausted? */
271 if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
272 pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
273 topo_info.nr_rejected_cpus++;
274 return;
275 }
276
277 topo_register_apic(apic_id, acpi_id, present);
278 }
279
280 /**
281 * topology_register_boot_apic - Register the boot CPU APIC
282 * @apic_id: The APIC ID to set up
283 *
284 * Separate so CPU #0 can be assigned
285 */
topology_register_boot_apic(u32 apic_id)286 void __init topology_register_boot_apic(u32 apic_id)
287 {
288 WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
289
290 topo_info.boot_cpu_apic_id = apic_id;
291 topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
292 }
293
294 /**
295 * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
296 * @apicid: The APIC ID for which to lookup the logical ID
297 * @at_level: The topology domain level to use
298 *
299 * @apicid must be a full APIC ID, not the normalized variant. It's valid to have
300 * all bits below the domain level specified by @at_level to be clear. So both
301 * real APIC IDs and backshifted normalized APIC IDs work correctly.
302 *
303 * Returns:
304 * - >= 0: The requested logical ID
305 * - -ERANGE: @apicid is out of range
306 * - -ENODEV: @apicid is not registered
307 */
topology_get_logical_id(u32 apicid,enum x86_topology_domains at_level)308 int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
309 {
310 /* Remove the bits below @at_level to get the proper level ID of @apicid */
311 unsigned int lvlid = topo_apicid(apicid, at_level);
312
313 if (lvlid >= MAX_LOCAL_APIC)
314 return -ERANGE;
315 if (!test_bit(lvlid, apic_maps[at_level].map))
316 return -ENODEV;
317 /* Get the number of set bits before @lvlid. */
318 return bitmap_weight(apic_maps[at_level].map, lvlid);
319 }
320 EXPORT_SYMBOL_GPL(topology_get_logical_id);
321
322 /**
323 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
324 * @apicid: The APIC ID which specifies the search range
325 * @which_units: The domain level specifying the units to count
326 * @at_level: The domain level at which @which_units have to be counted
327 *
328 * This returns the number of possible units according to the enumerated
329 * information.
330 *
331 * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
332 * counts the number of possible cores in the package to which @apicid
333 * belongs.
334 *
335 * @at_level must obviously be greater than @which_level to produce useful
336 * results. If @at_level is equal to @which_units the result is
337 * unsurprisingly 1. If @at_level is less than @which_units the results
338 * is by definition undefined and the function returns 0.
339 */
topology_unit_count(u32 apicid,enum x86_topology_domains which_units,enum x86_topology_domains at_level)340 unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
341 enum x86_topology_domains at_level)
342 {
343 /* Remove the bits below @at_level to get the proper level ID of @apicid */
344 unsigned int lvlid = topo_apicid(apicid, at_level);
345
346 if (lvlid >= MAX_LOCAL_APIC)
347 return 0;
348 if (!test_bit(lvlid, apic_maps[at_level].map))
349 return 0;
350 if (which_units > at_level)
351 return 0;
352 if (which_units == at_level)
353 return 1;
354 return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
355 }
356
#ifdef CONFIG_SMP
/*
 * Return the CPU number which is assigned to the primary thread of the
 * core to which @cpu belongs, or the error code of the lookup if no CPU
 * number is assigned to it.
 */
int topology_get_primary_thread(unsigned int cpu)
{
	/* The core domain level APIC ID is the APIC ID of the primary thread */
	u32 core_id = topo_apicid(cpuid_to_apicid[cpu], TOPO_CORE_DOMAIN);

	return topo_lookup_cpuid(core_id);
}
#endif
369
370 #ifdef CONFIG_ACPI_HOTPLUG_CPU
371 /**
372 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
373 * @apic_id: The APIC ID to set up
374 * @acpi_id: The ACPI ID associated to the APIC
375 */
topology_hotplug_apic(u32 apic_id,u32 acpi_id)376 int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
377 {
378 int cpu;
379
380 if (apic_id >= MAX_LOCAL_APIC)
381 return -EINVAL;
382
383 /* Reject if the APIC ID was not registered during enumeration. */
384 if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
385 return -ENODEV;
386
387 cpu = topo_lookup_cpuid(apic_id);
388 if (cpu < 0)
389 return -ENOSPC;
390
391 set_bit(apic_id, phys_cpu_present_map);
392 topo_set_cpuids(cpu, apic_id, acpi_id);
393 cpu_mark_primary_thread(cpu, apic_id);
394 return cpu;
395 }
396
397 /**
398 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
399 * @cpu: The CPU number for which the APIC ID is removed
400 */
topology_hotunplug_apic(unsigned int cpu)401 void topology_hotunplug_apic(unsigned int cpu)
402 {
403 u32 apic_id = cpuid_to_apicid[cpu];
404
405 if (apic_id == BAD_APICID)
406 return;
407
408 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
409 clear_bit(apic_id, phys_cpu_present_map);
410 set_cpu_present(cpu, false);
411 }
412 #endif
413
414 #ifdef CONFIG_X86_LOCAL_APIC
415 static unsigned int max_possible_cpus __initdata = NR_CPUS;
416
417 /**
418 * topology_apply_cmdline_limits_early - Apply topology command line limits early
419 *
420 * Ensure that command line limits are in effect before firmware parsing
421 * takes place.
422 */
topology_apply_cmdline_limits_early(void)423 void __init topology_apply_cmdline_limits_early(void)
424 {
425 unsigned int possible = nr_cpu_ids;
426
427 /* 'maxcpus=0' 'nosmp' 'nolapic' */
428 if (!setup_max_cpus || apic_is_disabled)
429 possible = 1;
430
431 /* 'possible_cpus=N' */
432 possible = min_t(unsigned int, max_possible_cpus, possible);
433
434 if (possible < nr_cpu_ids) {
435 pr_info("Limiting to %u possible CPUs\n", possible);
436 set_nr_cpu_ids(possible);
437 }
438 }
439
restrict_to_up(void)440 static __init bool restrict_to_up(void)
441 {
442 if (!smp_found_config)
443 return true;
444 /*
445 * XEN PV is special as it does not advertise the local APIC
446 * properly, but provides a fake topology for it so that the
447 * infrastructure works. So don't apply the restrictions vs. APIC
448 * here.
449 */
450 if (xen_pv_domain())
451 return false;
452
453 return apic_is_disabled;
454 }
455
/**
 * topology_init_possible_cpus - Finalize the CPU enumeration
 *
 * Invoked after the APIC enumeration. Clamps the number of assigned and
 * disabled (hot-pluggable) CPUs to the allowed limit, updates nr_cpu_ids,
 * computes and prints the topology limits from the domain maps, assigns
 * CPU numbers to the non-present APICs and finally populates the possible
 * and present CPU masks.
 */
void __init topology_init_possible_cpus(void)
{
	unsigned int assigned = topo_info.nr_assigned_cpus;
	unsigned int disabled = topo_info.nr_disabled_cpus;
	unsigned int cnta, cntb, cpu, allowed = 1;
	unsigned int total = assigned + disabled;
	u32 apicid, firstid;

	/*
	 * If there was no APIC registered, then fake one so that the
	 * topology bitmap is populated. That ensures that the code below
	 * is valid and the various query interfaces can be used
	 * unconditionally. This does not affect the actual APIC code in
	 * any way because either the local APIC address has not been
	 * registered or the local APIC was disabled on the command line.
	 */
	if (topo_info.boot_cpu_apic_id == BAD_APICID)
		topology_register_boot_apic(0);

	/* @allowed stays at 1 when the system is restricted to UP */
	if (!restrict_to_up()) {
		if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
			disabled += assigned - nr_cpu_ids;
			assigned = nr_cpu_ids;
		}
		allowed = min_t(unsigned int, total, nr_cpu_ids);
	}

	if (total > allowed)
		pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);

	/* Assigned CPUs take precedence, disabled CPUs get the remainder */
	assigned = min_t(unsigned int, allowed, assigned);
	disabled = allowed - assigned;

	topo_info.nr_assigned_cpus = assigned;
	topo_info.nr_disabled_cpus = disabled;

	total_cpus = allowed;
	set_nr_cpu_ids(allowed);

	/* cnta holds the number of registered packages up to the die calculation */
	cnta = domain_weight(TOPO_PKG_DOMAIN);
	__max_logical_packages = cnta;

	pr_info("Max. logical packages: %3u\n", __max_logical_packages);

	cntb = num_phys_nodes();
	__num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);

	pr_info("Max. logical nodes: %3u\n", cntb);
	pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);

	/* Dies per package is derived from the order delta of dies and packages */
	cntb = domain_weight(TOPO_DIE_DOMAIN);
	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

	pr_info("Max. logical dies: %3u\n", cntb);
	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);

	cnta = domain_weight(TOPO_CORE_DOMAIN);
	cntb = domain_weight(TOPO_SMT_DOMAIN);
	/*
	 * Can't use order delta here as order(cnta) can be equal
	 * order(cntb) even if cnta != cntb.
	 */
	__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
	pr_info("Max. threads per core: %3u\n", __max_threads_per_core);

	/* The per package counts are taken from the package of the first registered APIC */
	firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
	__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
	__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. threads per package: %3u\n", __num_threads_per_package);

	pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
	if (topo_info.nr_rejected_cpus)
		pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);

	/* Reset both masks to CPU #0 only. They are rebuilt below. */
	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	/*
	 * Assign CPU numbers to non-present CPUs, i.e. to the APIC IDs which
	 * are registered in the SMT domain map but not set in
	 * phys_cpu_present_map. Stops when either @disabled CPU numbers have
	 * been handed out or no further such APIC ID exists.
	 */
	for (apicid = 0; disabled; disabled--, apicid++) {
		apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
					      MAX_LOCAL_APIC, apicid);
		if (apicid >= MAX_LOCAL_APIC)
			break;
		cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
	}

	for (cpu = 0; cpu < allowed; cpu++) {
		apicid = cpuid_to_apicid[cpu];

		/* Every allowed CPU number is possible, even without an APIC ID */
		set_cpu_possible(cpu, true);

		if (apicid == BAD_APICID)
			continue;

		cpu_mark_primary_thread(cpu, apicid);
		set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
	}
}
555
556 /*
557 * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
558 */
topology_reset_possible_cpus_up(void)559 void __init topology_reset_possible_cpus_up(void)
560 {
561 init_cpu_present(cpumask_of(0));
562 init_cpu_possible(cpumask_of(0));
563
564 bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
565 if (topo_info.boot_cpu_apic_id != BAD_APICID)
566 set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
567 }
568
setup_possible_cpus(char * str)569 static int __init setup_possible_cpus(char *str)
570 {
571 get_option(&str, &max_possible_cpus);
572 return 0;
573 }
574 early_param("possible_cpus", setup_possible_cpus);
575 #endif
576