xref: /linux/arch/x86/kernel/cpu/topology.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * CPU/APIC topology
4  *
5  * The APIC IDs describe the system topology in multiple domain levels.
6  * The CPUID topology parser provides the information which part of the
7  * APIC ID is associated to the individual levels:
8  *
9  * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
10  *
11  * The root space contains the package (socket) IDs.
12  *
13  * Not enumerated levels consume 0 bits space, but conceptually they are
14  * always represented. If e.g. only CORE and THREAD levels are enumerated
15  * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
16  *
17  * If SMT is not supported, then the THREAD domain is still used. It then
18  * has the same physical ID as the CORE domain and is the only child of
19  * the core domain.
20  *
21  * This allows a unified view on the system independent of the enumerated
22  * domain levels without requiring any conditionals in the code.
23  */
24 #define pr_fmt(fmt) "CPU topo: " fmt
25 #include <linux/cpu.h>
26 
27 #include <xen/xen.h>
28 
29 #include <asm/apic.h>
30 #include <asm/io_apic.h>
31 #include <asm/mpspec.h>
32 #include <asm/msr.h>
33 #include <asm/smp.h>
34 
35 #include "cpu.h"
36 
/*
 * Map cpu index to physical APIC ID
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
/* Map cpu index to the ACPI ID associated to the APIC */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);

/* Bitmap of physically present CPUs, indexed by APIC ID. */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;

/* Used for CPU number allocation and parallel CPU bringup */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };

/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;

/*
 * Keep track of assigned, disabled and rejected CPUs. nr_assigned_cpus is
 * preset to 1 because CPU #0 is reserved for the boot CPU.
 */
static struct {
	unsigned int		nr_assigned_cpus;
	unsigned int		nr_disabled_cpus;
	unsigned int		nr_rejected_cpus;
	u32			boot_cpu_apic_id;
	u32			real_bsp_apic_id;
} topo_info __ro_after_init = {
	.nr_assigned_cpus	= 1,
	.boot_cpu_apic_id	= BAD_APICID,
	.real_bsp_apic_id	= BAD_APICID,
};

/* Number of APIC IDs registered in the map of topology domain @_dom */
#define domain_weight(_dom)	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
71 
72 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
73 {
74 	return phys_id == (u64)cpuid_to_apicid[cpu];
75 }
76 
77 static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
78 {
79 	if (!(apicid & (__max_threads_per_core - 1)))
80 		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
81 }
82 
83 /*
84  * Convert the APIC ID to a domain level ID by masking out the low bits
85  * below the domain level @dom.
86  */
87 static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
88 {
89 	if (dom == TOPO_SMT_DOMAIN)
90 		return apicid;
91 	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
92 }
93 
94 static int topo_lookup_cpuid(u32 apic_id)
95 {
96 	int i;
97 
98 	/* CPU# to APICID mapping is persistent once it is established */
99 	for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
100 		if (cpuid_to_apicid[i] == apic_id)
101 			return i;
102 	}
103 	return -ENODEV;
104 }
105 
106 static __init int topo_get_cpunr(u32 apic_id)
107 {
108 	int cpu = topo_lookup_cpuid(apic_id);
109 
110 	if (cpu >= 0)
111 		return cpu;
112 
113 	return topo_info.nr_assigned_cpus++;
114 }
115 
/*
 * Store the APIC and ACPI IDs of @cpu in the early per CPU maps and mark
 * the CPU present.
 */
static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
	early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
	early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
	set_cpu_present(cpu, true);
}
124 
/*
 * Detect whether registering @apic_id is safe, i.e. whether it might be
 * the real BSP while this kernel boots on a different CPU (crash kernel
 * scenario).
 *
 * Returns true when the APIC must NOT be registered, false otherwise.
 */
static __init bool check_for_real_bsp(u32 apic_id)
{
	bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
	u64 msr;

	/*
	 * There is no real good way to detect whether this is a kdump()
	 * kernel, but except on the Voyager SMP monstrosity which is no
	 * longer supported, the real BSP APIC ID is the first one which is
	 * enumerated by firmware. That allows to detect whether the boot
	 * CPU is the real BSP. If it is not, then do not register the APIC
	 * because sending INIT to the real BSP would reset the whole
	 * system.
	 *
	 * The first APIC ID which is enumerated by firmware is detectable
	 * because the boot CPU APIC ID is registered before that without
	 * invoking this code.
	 */
	if (topo_info.real_bsp_apic_id != BAD_APICID)
		return false;

	/*
	 * Check whether the enumeration order is broken by evaluating the
	 * BSP bit in the APICBASE MSR. If the CPU does not have the
	 * APICBASE MSR then the BSP detection is not possible and the
	 * kernel must rely on the firmware enumeration order.
	 */
	if (has_apic_base) {
		rdmsrq(MSR_IA32_APICBASE, msr);
		is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
	}

	if (apic_id == topo_info.boot_cpu_apic_id) {
		/*
		 * If the boot CPU has the APIC BSP bit set then the
		 * firmware enumeration is agreeing. If the CPU does not
		 * have the APICBASE MSR then the only choice is to trust
		 * the enumeration order.
		 */
		if (is_bsp || !has_apic_base) {
			topo_info.real_bsp_apic_id = apic_id;
			return false;
		}
		/*
		 * If the boot APIC is enumerated first, but the APICBASE
		 * MSR does not have the BSP bit set, then there is no way
		 * to discover the real BSP here. Assume a crash kernel and
		 * limit the number of CPUs to 1 as an INIT to the real BSP
		 * would reset the machine.
		 */
		pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
		pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
		set_nr_cpu_ids(1);
		goto fwbug;
	}

	pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
		topo_info.boot_cpu_apic_id, apic_id);

	if (is_bsp) {
		/*
		 * The boot CPU has the APIC BSP bit set. Use it and complain
		 * about the broken firmware enumeration.
		 */
		topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
		goto fwbug;
	}

	pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");

	topo_info.real_bsp_apic_id = apic_id;
	return true;

fwbug:
	pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
	return false;
}
202 
203 static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
204 				    unsigned long *map)
205 {
206 	unsigned int id, end, cnt = 0;
207 
208 	/* Calculate the exclusive end */
209 	end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
210 
211 	/* Unfortunately there is no bitmap_weight_range() */
212 	for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
213 		cnt++;
214 	return cnt;
215 }
216 
/*
 * Registration worker: record @apic_id in the physical present map and in
 * the per domain topology maps and, for present CPUs, assign a CPU number
 * and populate the per CPU ID maps.
 */
static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	int cpu, dom;

	if (present) {
		set_bit(apic_id, phys_cpu_present_map);

		/*
		 * Double registration is valid in case of the boot CPU
		 * APIC because that is registered before the enumeration
		 * of the APICs via firmware parsers or VM guest
		 * mechanisms.
		 */
		if (apic_id == topo_info.boot_cpu_apic_id)
			cpu = 0;
		else
			cpu = topo_get_cpunr(apic_id);

		cpuid_to_apicid[cpu] = apic_id;
		topo_set_cpuids(cpu, apic_id, acpi_id);
	} else {
		topo_info.nr_disabled_cpus++;
	}

	/*
	 * Register present and possible CPUs in the domain
	 * maps. cpu_possible_map will be updated in
	 * topology_init_possible_cpus() after enumeration is done.
	 */
	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
		set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
}
249 
250 /**
251  * topology_register_apic - Register an APIC in early topology maps
252  * @apic_id:	The APIC ID to set up
253  * @acpi_id:	The ACPI ID associated to the APIC
254  * @present:	True if the corresponding CPU is present
255  */
256 void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
257 {
258 	if (apic_id >= MAX_LOCAL_APIC) {
259 		pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
260 		topo_info.nr_rejected_cpus++;
261 		return;
262 	}
263 
264 	if (check_for_real_bsp(apic_id)) {
265 		topo_info.nr_rejected_cpus++;
266 		return;
267 	}
268 
269 	/* CPU numbers exhausted? */
270 	if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
271 		pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
272 		topo_info.nr_rejected_cpus++;
273 		return;
274 	}
275 
276 	topo_register_apic(apic_id, acpi_id, present);
277 }
278 
/**
 * topology_register_boot_apic - Register the boot CPU APIC
 * @apic_id:	The APIC ID to set up
 *
 * Separate so CPU #0 can be assigned
 */
void __init topology_register_boot_apic(u32 apic_id)
{
	/* The boot CPU APIC must only be registered once */
	WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);

	topo_info.boot_cpu_apic_id = apic_id;
	topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
}
292 
293 /**
294  * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
295  * @apicid:		The APIC ID for which to lookup the logical ID
296  * @at_level:		The topology domain level to use
297  *
298  * @apicid must be a full APIC ID, not the normalized variant. It's valid to have
299  * all bits below the domain level specified by @at_level to be clear. So both
300  * real APIC IDs and backshifted normalized APIC IDs work correctly.
301  *
302  * Returns:
303  *  - >= 0:	The requested logical ID
304  *  - -ERANGE:	@apicid is out of range
305  *  - -ENODEV:	@apicid is not registered
306  */
307 int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
308 {
309 	/* Remove the bits below @at_level to get the proper level ID of @apicid */
310 	unsigned int lvlid = topo_apicid(apicid, at_level);
311 
312 	if (lvlid >= MAX_LOCAL_APIC)
313 		return -ERANGE;
314 	if (!test_bit(lvlid, apic_maps[at_level].map))
315 		return -ENODEV;
316 	/* Get the number of set bits before @lvlid. */
317 	return bitmap_weight(apic_maps[at_level].map, lvlid);
318 }
319 EXPORT_SYMBOL_GPL(topology_get_logical_id);
320 
/**
 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
 * @apicid:		The APIC ID which specifies the search range
 * @which_units:	The domain level specifying the units to count
 * @at_level:		The domain level at which @which_units have to be counted
 *
 * This returns the number of possible units according to the enumerated
 * information.
 *
 * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
 * counts the number of possible cores in the package to which @apicid
 * belongs.
 *
 * @at_level must obviously be greater than @which_units to produce useful
 * results.  If @at_level is equal to @which_units the result is
 * unsurprisingly 1. If @at_level is less than @which_units the result
 * is by definition undefined and the function returns 0.
 */
unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
				 enum x86_topology_domains at_level)
{
	/* Remove the bits below @at_level to get the proper level ID of @apicid */
	unsigned int lvlid = topo_apicid(apicid, at_level);

	if (lvlid >= MAX_LOCAL_APIC)
		return 0;
	if (!test_bit(lvlid, apic_maps[at_level].map))
		return 0;
	if (which_units > at_level)
		return 0;
	if (which_units == at_level)
		return 1;
	return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
}
355 
356 #ifdef CONFIG_SMP
357 int topology_get_primary_thread(unsigned int cpu)
358 {
359 	u32 apic_id = cpuid_to_apicid[cpu];
360 
361 	/*
362 	 * Get the core domain level APIC id, which is the primary thread
363 	 * and return the CPU number assigned to it.
364 	 */
365 	return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN));
366 }
367 #endif
368 
369 #ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated to the APIC
 *
 * Returns:
 *  - >= 0:	The CPU number which was assigned to @apic_id
 *  - -EINVAL:	@apic_id exceeds the kernel's APIC ID limit
 *  - -ENODEV:	@apic_id was not registered during enumeration
 *  - -ENOSPC:	No CPU number was assigned to @apic_id
 */
int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
{
	int cpu;

	if (apic_id >= MAX_LOCAL_APIC)
		return -EINVAL;

	/* Reject if the APIC ID was not registered during enumeration. */
	if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
		return -ENODEV;

	cpu = topo_lookup_cpuid(apic_id);
	if (cpu < 0)
		return -ENOSPC;

	set_bit(apic_id, phys_cpu_present_map);
	topo_set_cpuids(cpu, apic_id, acpi_id);
	cpu_mark_primary_thread(cpu, apic_id);
	return cpu;
}
395 
/**
 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
 * @cpu:	The CPU number for which the APIC ID is removed
 */
void topology_hotunplug_apic(unsigned int cpu)
{
	u32 apic_id = cpuid_to_apicid[cpu];

	/* Nothing to do when no APIC ID is associated with @cpu */
	if (apic_id == BAD_APICID)
		return;

	per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
	clear_bit(apic_id, phys_cpu_present_map);
	set_cpu_present(cpu, false);
}
411 #endif
412 
413 #ifdef CONFIG_X86_LOCAL_APIC
/* Limit set by the 'possible_cpus=' command line parameter, default NR_CPUS */
static unsigned int max_possible_cpus __initdata = NR_CPUS;
415 
416 /**
417  * topology_apply_cmdline_limits_early - Apply topology command line limits early
418  *
419  * Ensure that command line limits are in effect before firmware parsing
420  * takes place.
421  */
422 void __init topology_apply_cmdline_limits_early(void)
423 {
424 	unsigned int possible = nr_cpu_ids;
425 
426 	/* 'maxcpus=0' 'nosmp' 'nolapic' */
427 	if (!setup_max_cpus || apic_is_disabled)
428 		possible = 1;
429 
430 	/* 'possible_cpus=N' */
431 	possible = min_t(unsigned int, max_possible_cpus, possible);
432 
433 	if (possible < nr_cpu_ids) {
434 		pr_info("Limiting to %u possible CPUs\n", possible);
435 		set_nr_cpu_ids(possible);
436 	}
437 }
438 
439 static __init bool restrict_to_up(void)
440 {
441 	if (!smp_found_config)
442 		return true;
443 	/*
444 	 * XEN PV is special as it does not advertise the local APIC
445 	 * properly, but provides a fake topology for it so that the
446 	 * infrastructure works. So don't apply the restrictions vs. APIC
447 	 * here.
448 	 */
449 	if (xen_pv_domain())
450 		return false;
451 
452 	return apic_is_disabled;
453 }
454 
/*
 * Size the possible CPU space after firmware enumeration: apply the
 * nr_cpu_ids limit, compute the system wide topology maxima and populate
 * the present and possible CPU masks.
 */
void __init topology_init_possible_cpus(void)
{
	unsigned int assigned = topo_info.nr_assigned_cpus;
	unsigned int disabled = topo_info.nr_disabled_cpus;
	unsigned int cnta, cntb, cpu, allowed = 1;
	unsigned int total = assigned + disabled;
	u32 apicid, firstid;

	/*
	 * If there was no APIC registered, then fake one so that the
	 * topology bitmap is populated. That ensures that the code below
	 * is valid and the various query interfaces can be used
	 * unconditionally. This does not affect the actual APIC code in
	 * any way because either the local APIC address has not been
	 * registered or the local APIC was disabled on the command line.
	 */
	if (topo_info.boot_cpu_apic_id == BAD_APICID)
		topology_register_boot_apic(0);

	if (!restrict_to_up()) {
		if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
			disabled += assigned - nr_cpu_ids;
			assigned = nr_cpu_ids;
		}
		allowed = min_t(unsigned int, total, nr_cpu_ids);
	}

	if (total > allowed)
		pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);

	/* Distribute the allowed CPU space between assigned and disabled */
	assigned = min_t(unsigned int, allowed, assigned);
	disabled = allowed - assigned;

	topo_info.nr_assigned_cpus = assigned;
	topo_info.nr_disabled_cpus = disabled;

	total_cpus = allowed;
	set_nr_cpu_ids(allowed);

	cnta = domain_weight(TOPO_PKG_DOMAIN);
	cntb = domain_weight(TOPO_DIE_DOMAIN);
	__max_logical_packages = cnta;
	/* Dies per package is derived from the order delta of the counts */
	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

	pr_info("Max. logical packages: %3u\n", cnta);
	pr_info("Max. logical dies:     %3u\n", cntb);
	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);

	cnta = domain_weight(TOPO_CORE_DOMAIN);
	cntb = domain_weight(TOPO_SMT_DOMAIN);
	/*
	 * Can't use order delta here as order(cnta) can be equal
	 * order(cntb) even if cnta != cntb.
	 */
	__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
	pr_info("Max. threads per core: %3u\n", __max_threads_per_core);

	firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
	__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. cores per package:   %3u\n", __num_cores_per_package);
	__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. threads per package: %3u\n", __num_threads_per_package);

	pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
	if (topo_info.nr_rejected_cpus)
		pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);

	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	/* Assign CPU numbers to non-present CPUs */
	for (apicid = 0; disabled; disabled--, apicid++) {
		apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
					      MAX_LOCAL_APIC, apicid);
		if (apicid >= MAX_LOCAL_APIC)
			break;
		cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
	}

	/* Mark all assigned CPUs possible and the physically present ones present */
	for (cpu = 0; cpu < allowed; cpu++) {
		apicid = cpuid_to_apicid[cpu];

		set_cpu_possible(cpu, true);

		if (apicid == BAD_APICID)
			continue;

		cpu_mark_primary_thread(cpu, apicid);
		set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
	}
}
546 
/*
 * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
 * Shrinks the present and possible masks to the boot CPU and rebuilds the
 * physical present map with only the boot CPU APIC set.
 */
void __init topology_reset_possible_cpus_up(void)
{
	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
	if (topo_info.boot_cpu_apic_id != BAD_APICID)
		set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
}
559 
/* Parse the 'possible_cpus=N' command line parameter */
static int __init setup_possible_cpus(char *str)
{
	get_option(&str, &max_possible_cpus);
	return 0;
}
early_param("possible_cpus", setup_possible_cpus);
566 #endif
567