xref: /linux/arch/x86/kernel/cpu/topology.c (revision 30bbcb44707a97fcb62246bebc8b413b5ab293f8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * CPU/APIC topology
4  *
5  * The APIC IDs describe the system topology in multiple domain levels.
6  * The CPUID topology parser provides the information which part of the
7  * APIC ID is associated to the individual levels:
8  *
9  * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
10  *
11  * The root space contains the package (socket) IDs.
12  *
13  * Not enumerated levels consume 0 bits space, but conceptually they are
14  * always represented. If e.g. only CORE and THREAD levels are enumerated
15  * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
16  *
17  * If SMT is not supported, then the THREAD domain is still used. It then
18  * has the same physical ID as the CORE domain and is the only child of
19  * the core domain.
20  *
21  * This allows a unified view on the system independent of the enumerated
22  * domain levels without requiring any conditionals in the code.
23  */
24 #define pr_fmt(fmt) "CPU topo: " fmt
25 #include <linux/cpu.h>
26 
27 #include <xen/xen.h>
28 
29 #include <asm/apic.h>
30 #include <asm/hypervisor.h>
31 #include <asm/io_apic.h>
32 #include <asm/mpspec.h>
33 #include <asm/msr.h>
34 #include <asm/smp.h>
35 
36 #include "cpu.h"
37 
/*
 * Early per-CPU lookup tables which translate a logical CPU number to
 * its physical APIC ID and to its ACPI ID. Entries which have not been
 * written yet hold BAD_APICID respectively CPU_ACPIID_INVALID.
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);

DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
45 
46 /* Bitmap of physically present CPUs. */
47 DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
48 
49 /* Used for CPU number allocation and parallel CPU bringup */
50 u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
51 
52 /* Bitmaps to mark registered APICs at each topology domain */
53 static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
54 
55 /*
56  * Keep track of assigned, disabled and rejected CPUs. Present assigned
57  * with 1 as CPU #0 is reserved for the boot CPU.
58  */
59 static struct {
60 	unsigned int		nr_assigned_cpus;
61 	unsigned int		nr_disabled_cpus;
62 	unsigned int		nr_rejected_cpus;
63 	u32			boot_cpu_apic_id;
64 	u32			real_bsp_apic_id;
65 } topo_info __ro_after_init = {
66 	.nr_assigned_cpus	= 1,
67 	.boot_cpu_apic_id	= BAD_APICID,
68 	.real_bsp_apic_id	= BAD_APICID,
69 };
70 
71 #define domain_weight(_dom)	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
72 
73 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
74 {
75 	return phys_id == (u64)cpuid_to_apicid[cpu];
76 }
77 
#ifdef CONFIG_SMP
/*
 * Add @cpu to the primary thread mask when @apicid has none of the bits
 * in (__max_threads_per_core - 1) set.
 */
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
	unsigned int thread_mask = __max_threads_per_core - 1;

	if (apicid & thread_mask)
		return;

	cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
#else
static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif
87 
88 /*
89  * Convert the APIC ID to a domain level ID by masking out the low bits
90  * below the domain level @dom.
91  */
92 static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
93 {
94 	if (dom == TOPO_SMT_DOMAIN)
95 		return apicid;
96 	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
97 }
98 
99 static int topo_lookup_cpuid(u32 apic_id)
100 {
101 	int i;
102 
103 	/* CPU# to APICID mapping is persistent once it is established */
104 	for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
105 		if (cpuid_to_apicid[i] == apic_id)
106 			return i;
107 	}
108 	return -ENODEV;
109 }
110 
111 static __init int topo_get_cpunr(u32 apic_id)
112 {
113 	int cpu = topo_lookup_cpuid(apic_id);
114 
115 	if (cpu >= 0)
116 		return cpu;
117 
118 	return topo_info.nr_assigned_cpus++;
119 }
120 
121 static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
122 {
123 #if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
124 	early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
125 	early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
126 #endif
127 	set_cpu_present(cpu, true);
128 }
129 
130 static __init bool check_for_real_bsp(u32 apic_id)
131 {
132 	bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
133 	u64 msr;
134 
135 	/*
136 	 * There is no real good way to detect whether this a kdump()
137 	 * kernel, but except on the Voyager SMP monstrosity which is not
138 	 * longer supported, the real BSP APIC ID is the first one which is
139 	 * enumerated by firmware. That allows to detect whether the boot
140 	 * CPU is the real BSP. If it is not, then do not register the APIC
141 	 * because sending INIT to the real BSP would reset the whole
142 	 * system.
143 	 *
144 	 * The first APIC ID which is enumerated by firmware is detectable
145 	 * because the boot CPU APIC ID is registered before that without
146 	 * invoking this code.
147 	 */
148 	if (topo_info.real_bsp_apic_id != BAD_APICID)
149 		return false;
150 
151 	/*
152 	 * Check whether the enumeration order is broken by evaluating the
153 	 * BSP bit in the APICBASE MSR. If the CPU does not have the
154 	 * APICBASE MSR then the BSP detection is not possible and the
155 	 * kernel must rely on the firmware enumeration order.
156 	 */
157 	if (has_apic_base) {
158 		rdmsrq(MSR_IA32_APICBASE, msr);
159 		is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
160 	}
161 
162 	if (apic_id == topo_info.boot_cpu_apic_id) {
163 		/*
164 		 * If the boot CPU has the APIC BSP bit set then the
165 		 * firmware enumeration is agreeing. If the CPU does not
166 		 * have the APICBASE MSR then the only choice is to trust
167 		 * the enumeration order.
168 		 */
169 		if (is_bsp || !has_apic_base) {
170 			topo_info.real_bsp_apic_id = apic_id;
171 			return false;
172 		}
173 		/*
174 		 * If the boot APIC is enumerated first, but the APICBASE
175 		 * MSR does not have the BSP bit set, then there is no way
176 		 * to discover the real BSP here. Assume a crash kernel and
177 		 * limit the number of CPUs to 1 as an INIT to the real BSP
178 		 * would reset the machine.
179 		 */
180 		pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
181 		pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
182 		set_nr_cpu_ids(1);
183 		goto fwbug;
184 	}
185 
186 	pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
187 		topo_info.boot_cpu_apic_id, apic_id);
188 
189 	if (is_bsp) {
190 		/*
191 		 * The boot CPU has the APIC BSP bit set. Use it and complain
192 		 * about the broken firmware enumeration.
193 		 */
194 		topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
195 		goto fwbug;
196 	}
197 
198 	pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
199 
200 	topo_info.real_bsp_apic_id = apic_id;
201 	return true;
202 
203 fwbug:
204 	pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
205 	return false;
206 }
207 
208 static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
209 				    unsigned long *map)
210 {
211 	unsigned int id, end, cnt = 0;
212 
213 	/* Calculate the exclusive end */
214 	end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
215 
216 	/* Unfortunately there is no bitmap_weight_range() */
217 	for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
218 		cnt++;
219 	return cnt;
220 }
221 
222 static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
223 {
224 	int cpu, dom;
225 
226 	if (present) {
227 		set_bit(apic_id, phys_cpu_present_map);
228 
229 		/*
230 		 * Double registration is valid in case of the boot CPU
231 		 * APIC because that is registered before the enumeration
232 		 * of the APICs via firmware parsers or VM guest
233 		 * mechanisms.
234 		 */
235 		if (apic_id == topo_info.boot_cpu_apic_id)
236 			cpu = 0;
237 		else
238 			cpu = topo_get_cpunr(apic_id);
239 
240 		cpuid_to_apicid[cpu] = apic_id;
241 		topo_set_cpuids(cpu, apic_id, acpi_id);
242 	} else {
243 		u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
244 
245 		/*
246 		 * Check for present APICs in the same package when running
247 		 * on bare metal. Allow the bogosity in a guest.
248 		 */
249 		if (hypervisor_is_type(X86_HYPER_NATIVE) &&
250 		    topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
251 			pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
252 				     apic_id);
253 			topo_info.nr_rejected_cpus++;
254 			return;
255 		}
256 
257 		topo_info.nr_disabled_cpus++;
258 	}
259 
260 	/*
261 	 * Register present and possible CPUs in the domain
262 	 * maps. cpu_possible_map will be updated in
263 	 * topology_init_possible_cpus() after enumeration is done.
264 	 */
265 	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
266 		set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
267 }
268 
269 /**
270  * topology_register_apic - Register an APIC in early topology maps
271  * @apic_id:	The APIC ID to set up
272  * @acpi_id:	The ACPI ID associated to the APIC
273  * @present:	True if the corresponding CPU is present
274  */
275 void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
276 {
277 	if (apic_id >= MAX_LOCAL_APIC) {
278 		pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
279 		topo_info.nr_rejected_cpus++;
280 		return;
281 	}
282 
283 	if (check_for_real_bsp(apic_id)) {
284 		topo_info.nr_rejected_cpus++;
285 		return;
286 	}
287 
288 	/* CPU numbers exhausted? */
289 	if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
290 		pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
291 		topo_info.nr_rejected_cpus++;
292 		return;
293 	}
294 
295 	topo_register_apic(apic_id, acpi_id, present);
296 }
297 
298 /**
299  * topology_register_boot_apic - Register the boot CPU APIC
300  * @apic_id:	The APIC ID to set up
301  *
302  * Separate so CPU #0 can be assigned
303  */
304 void __init topology_register_boot_apic(u32 apic_id)
305 {
306 	WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
307 
308 	topo_info.boot_cpu_apic_id = apic_id;
309 	topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
310 }
311 
312 /**
313  * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
314  * @apicid:		The APIC ID for which to lookup the logical ID
315  * @at_level:		The topology domain level to use
316  *
317  * @apicid must be a full APIC ID, not the normalized variant. It's valid to have
318  * all bits below the domain level specified by @at_level to be clear. So both
319  * real APIC IDs and backshifted normalized APIC IDs work correctly.
320  *
321  * Returns:
322  *  - >= 0:	The requested logical ID
323  *  - -ERANGE:	@apicid is out of range
324  *  - -ENODEV:	@apicid is not registered
325  */
326 int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
327 {
328 	/* Remove the bits below @at_level to get the proper level ID of @apicid */
329 	unsigned int lvlid = topo_apicid(apicid, at_level);
330 
331 	if (lvlid >= MAX_LOCAL_APIC)
332 		return -ERANGE;
333 	if (!test_bit(lvlid, apic_maps[at_level].map))
334 		return -ENODEV;
335 	/* Get the number of set bits before @lvlid. */
336 	return bitmap_weight(apic_maps[at_level].map, lvlid);
337 }
338 EXPORT_SYMBOL_GPL(topology_get_logical_id);
339 
340 /**
341  * topology_unit_count - Retrieve the count of specified units at a given topology domain level
342  * @apicid:		The APIC ID which specifies the search range
343  * @which_units:	The domain level specifying the units to count
344  * @at_level:		The domain level at which @which_units have to be counted
345  *
346  * This returns the number of possible units according to the enumerated
347  * information.
348  *
349  * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
350  * counts the number of possible cores in the package to which @apicid
351  * belongs.
352  *
353  * @at_level must obviously be greater than @which_level to produce useful
354  * results.  If @at_level is equal to @which_units the result is
355  * unsurprisingly 1. If @at_level is less than @which_units the results
356  * is by definition undefined and the function returns 0.
357  */
358 unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
359 				 enum x86_topology_domains at_level)
360 {
361 	/* Remove the bits below @at_level to get the proper level ID of @apicid */
362 	unsigned int lvlid = topo_apicid(apicid, at_level);
363 
364 	if (lvlid >= MAX_LOCAL_APIC)
365 		return 0;
366 	if (!test_bit(lvlid, apic_maps[at_level].map))
367 		return 0;
368 	if (which_units > at_level)
369 		return 0;
370 	if (which_units == at_level)
371 		return 1;
372 	return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
373 }
374 
#ifdef CONFIG_SMP
/*
 * Return the CPU number which is assigned to the core domain level APIC
 * ID of @cpu, i.e. to the primary thread of the core. The result is
 * whatever topo_lookup_cpuid() returns for that ID.
 */
int topology_get_primary_thread(unsigned int cpu)
{
	u32 core_id = topo_apicid(cpuid_to_apicid[cpu], TOPO_CORE_DOMAIN);

	return topo_lookup_cpuid(core_id);
}
#endif
387 
388 #ifdef CONFIG_ACPI_HOTPLUG_CPU
389 /**
390  * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
391  * @apic_id:	The APIC ID to set up
392  * @acpi_id:	The ACPI ID associated to the APIC
393  */
394 int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
395 {
396 	int cpu;
397 
398 	if (apic_id >= MAX_LOCAL_APIC)
399 		return -EINVAL;
400 
401 	/* Reject if the APIC ID was not registered during enumeration. */
402 	if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
403 		return -ENODEV;
404 
405 	cpu = topo_lookup_cpuid(apic_id);
406 	if (cpu < 0)
407 		return -ENOSPC;
408 
409 	set_bit(apic_id, phys_cpu_present_map);
410 	topo_set_cpuids(cpu, apic_id, acpi_id);
411 	cpu_mark_primary_thread(cpu, apic_id);
412 	return cpu;
413 }
414 
415 /**
416  * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
417  * @cpu:	The CPU number for which the APIC ID is removed
418  */
419 void topology_hotunplug_apic(unsigned int cpu)
420 {
421 	u32 apic_id = cpuid_to_apicid[cpu];
422 
423 	if (apic_id == BAD_APICID)
424 		return;
425 
426 	per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
427 	clear_bit(apic_id, phys_cpu_present_map);
428 	set_cpu_present(cpu, false);
429 }
430 #endif
431 
432 #ifdef CONFIG_X86_LOCAL_APIC
433 static unsigned int max_possible_cpus __initdata = NR_CPUS;
434 
435 /**
436  * topology_apply_cmdline_limits_early - Apply topology command line limits early
437  *
438  * Ensure that command line limits are in effect before firmware parsing
439  * takes place.
440  */
441 void __init topology_apply_cmdline_limits_early(void)
442 {
443 	unsigned int possible = nr_cpu_ids;
444 
445 	/* 'maxcpus=0' 'nosmp' 'nolapic' */
446 	if (!setup_max_cpus || apic_is_disabled)
447 		possible = 1;
448 
449 	/* 'possible_cpus=N' */
450 	possible = min_t(unsigned int, max_possible_cpus, possible);
451 
452 	if (possible < nr_cpu_ids) {
453 		pr_info("Limiting to %u possible CPUs\n", possible);
454 		set_nr_cpu_ids(possible);
455 	}
456 }
457 
458 static __init bool restrict_to_up(void)
459 {
460 	if (!smp_found_config)
461 		return true;
462 	/*
463 	 * XEN PV is special as it does not advertise the local APIC
464 	 * properly, but provides a fake topology for it so that the
465 	 * infrastructure works. So don't apply the restrictions vs. APIC
466 	 * here.
467 	 */
468 	if (xen_pv_domain())
469 		return false;
470 
471 	return apic_is_disabled;
472 }
473 
474 void __init topology_init_possible_cpus(void)
475 {
476 	unsigned int assigned = topo_info.nr_assigned_cpus;
477 	unsigned int disabled = topo_info.nr_disabled_cpus;
478 	unsigned int cnta, cntb, cpu, allowed = 1;
479 	unsigned int total = assigned + disabled;
480 	u32 apicid, firstid;
481 
482 	/*
483 	 * If there was no APIC registered, then fake one so that the
484 	 * topology bitmap is populated. That ensures that the code below
485 	 * is valid and the various query interfaces can be used
486 	 * unconditionally. This does not affect the actual APIC code in
487 	 * any way because either the local APIC address has not been
488 	 * registered or the local APIC was disabled on the command line.
489 	 */
490 	if (topo_info.boot_cpu_apic_id == BAD_APICID)
491 		topology_register_boot_apic(0);
492 
493 	if (!restrict_to_up()) {
494 		if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
495 			disabled += assigned - nr_cpu_ids;
496 			assigned = nr_cpu_ids;
497 		}
498 		allowed = min_t(unsigned int, total, nr_cpu_ids);
499 	}
500 
501 	if (total > allowed)
502 		pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
503 
504 	assigned = min_t(unsigned int, allowed, assigned);
505 	disabled = allowed - assigned;
506 
507 	topo_info.nr_assigned_cpus = assigned;
508 	topo_info.nr_disabled_cpus = disabled;
509 
510 	total_cpus = allowed;
511 	set_nr_cpu_ids(allowed);
512 
513 	cnta = domain_weight(TOPO_PKG_DOMAIN);
514 	cntb = domain_weight(TOPO_DIE_DOMAIN);
515 	__max_logical_packages = cnta;
516 	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
517 
518 	pr_info("Max. logical packages: %3u\n", cnta);
519 	pr_info("Max. logical dies:     %3u\n", cntb);
520 	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
521 
522 	cnta = domain_weight(TOPO_CORE_DOMAIN);
523 	cntb = domain_weight(TOPO_SMT_DOMAIN);
524 	/*
525 	 * Can't use order delta here as order(cnta) can be equal
526 	 * order(cntb) even if cnta != cntb.
527 	 */
528 	__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
529 	pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
530 
531 	firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
532 	__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
533 	pr_info("Num. cores per package:   %3u\n", __num_cores_per_package);
534 	__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
535 	pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
536 
537 	pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
538 	if (topo_info.nr_rejected_cpus)
539 		pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
540 
541 	init_cpu_present(cpumask_of(0));
542 	init_cpu_possible(cpumask_of(0));
543 
544 	/* Assign CPU numbers to non-present CPUs */
545 	for (apicid = 0; disabled; disabled--, apicid++) {
546 		apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
547 					      MAX_LOCAL_APIC, apicid);
548 		if (apicid >= MAX_LOCAL_APIC)
549 			break;
550 		cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
551 	}
552 
553 	for (cpu = 0; cpu < allowed; cpu++) {
554 		apicid = cpuid_to_apicid[cpu];
555 
556 		set_cpu_possible(cpu, true);
557 
558 		if (apicid == BAD_APICID)
559 			continue;
560 
561 		cpu_mark_primary_thread(cpu, apicid);
562 		set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
563 	}
564 }
565 
566 /*
567  * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
568  */
569 void __init topology_reset_possible_cpus_up(void)
570 {
571 	init_cpu_present(cpumask_of(0));
572 	init_cpu_possible(cpumask_of(0));
573 
574 	bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
575 	if (topo_info.boot_cpu_apic_id != BAD_APICID)
576 		set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
577 }
578 
579 static int __init setup_possible_cpus(char *str)
580 {
581 	get_option(&str, &max_possible_cpus);
582 	return 0;
583 }
584 early_param("possible_cpus", setup_possible_cpus);
585 #endif
586