1 // SPDX-License-Identifier: GPL-2.0-or-later
2 #include <linux/acpi.h>
3 #include <linux/cpu.h>
4 #include <linux/delay.h>
5 #include <linux/io.h>
6 #include <linux/kexec.h>
7 #include <linux/memblock.h>
8 #include <linux/pgtable.h>
9 #include <linux/sched/hotplug.h>
10 #include <asm/apic.h>
11 #include <asm/barrier.h>
12 #include <asm/init.h>
13 #include <asm/intel_pt.h>
14 #include <asm/nmi.h>
15 #include <asm/processor.h>
16 #include <asm/reboot.h>
17
18 /* Physical address of the Multiprocessor Wakeup Structure mailbox */
19 static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
20
21 /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
22 static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
23
24 static u64 acpi_mp_pgd __ro_after_init;
25 static u64 acpi_mp_reset_vector_paddr __ro_after_init;
26
acpi_mp_stop_this_cpu(void)27 static void acpi_mp_stop_this_cpu(void)
28 {
29 asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
30 }
31
acpi_mp_play_dead(void)32 static void acpi_mp_play_dead(void)
33 {
34 play_dead_common();
35 asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
36 }
37
acpi_mp_cpu_die(unsigned int cpu)38 static void acpi_mp_cpu_die(unsigned int cpu)
39 {
40 u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
41 unsigned long timeout;
42
43 /*
44 * Use TEST mailbox command to prove that BIOS got control over
45 * the CPU before declaring it dead.
46 *
47 * BIOS has to clear 'command' field of the mailbox.
48 */
49 acpi_mp_wake_mailbox->apic_id = apicid;
50 smp_store_release(&acpi_mp_wake_mailbox->command,
51 ACPI_MP_WAKE_COMMAND_TEST);
52
53 /* Don't wait longer than a second. */
54 timeout = USEC_PER_SEC;
55 while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
56 udelay(1);
57
58 if (!timeout)
59 pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
60 }
61
62 /* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
alloc_pgt_page(void * dummy)63 static void __init *alloc_pgt_page(void *dummy)
64 {
65 return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
66 }
67
free_pgt_page(void * pgt,void * dummy)68 static void __init free_pgt_page(void *pgt, void *dummy)
69 {
70 return memblock_free(pgt, PAGE_SIZE);
71 }
72
73 /*
74 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
75 * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
76 * to the identity mapping and the function has be present at the same spot in
77 * the virtual address space before and after switching page tables.
78 */
init_transition_pgtable(pgd_t * pgd)79 static int __init init_transition_pgtable(pgd_t *pgd)
80 {
81 pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
82 unsigned long vaddr, paddr;
83 p4d_t *p4d;
84 pud_t *pud;
85 pmd_t *pmd;
86 pte_t *pte;
87
88 vaddr = (unsigned long)asm_acpi_mp_play_dead;
89 pgd += pgd_index(vaddr);
90 if (!pgd_present(*pgd)) {
91 p4d = (p4d_t *)alloc_pgt_page(NULL);
92 if (!p4d)
93 return -ENOMEM;
94 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
95 }
96 p4d = p4d_offset(pgd, vaddr);
97 if (!p4d_present(*p4d)) {
98 pud = (pud_t *)alloc_pgt_page(NULL);
99 if (!pud)
100 return -ENOMEM;
101 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
102 }
103 pud = pud_offset(p4d, vaddr);
104 if (!pud_present(*pud)) {
105 pmd = (pmd_t *)alloc_pgt_page(NULL);
106 if (!pmd)
107 return -ENOMEM;
108 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
109 }
110 pmd = pmd_offset(pud, vaddr);
111 if (!pmd_present(*pmd)) {
112 pte = (pte_t *)alloc_pgt_page(NULL);
113 if (!pte)
114 return -ENOMEM;
115 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
116 }
117 pte = pte_offset_kernel(pmd, vaddr);
118
119 paddr = __pa(vaddr);
120 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
121
122 return 0;
123 }
124
acpi_mp_setup_reset(u64 reset_vector)125 static int __init acpi_mp_setup_reset(u64 reset_vector)
126 {
127 struct x86_mapping_info info = {
128 .alloc_pgt_page = alloc_pgt_page,
129 .free_pgt_page = free_pgt_page,
130 .page_flag = __PAGE_KERNEL_LARGE_EXEC,
131 .kernpg_flag = _KERNPG_TABLE_NOENC,
132 };
133 pgd_t *pgd;
134
135 pgd = alloc_pgt_page(NULL);
136 if (!pgd)
137 return -ENOMEM;
138
139 for (int i = 0; i < nr_pfn_mapped; i++) {
140 unsigned long mstart, mend;
141
142 mstart = pfn_mapped[i].start << PAGE_SHIFT;
143 mend = pfn_mapped[i].end << PAGE_SHIFT;
144 if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
145 kernel_ident_mapping_free(&info, pgd);
146 return -ENOMEM;
147 }
148 }
149
150 if (kernel_ident_mapping_init(&info, pgd,
151 PAGE_ALIGN_DOWN(reset_vector),
152 PAGE_ALIGN(reset_vector + 1))) {
153 kernel_ident_mapping_free(&info, pgd);
154 return -ENOMEM;
155 }
156
157 if (init_transition_pgtable(pgd)) {
158 kernel_ident_mapping_free(&info, pgd);
159 return -ENOMEM;
160 }
161
162 smp_ops.play_dead = acpi_mp_play_dead;
163 smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
164 smp_ops.cpu_die = acpi_mp_cpu_die;
165
166 acpi_mp_reset_vector_paddr = reset_vector;
167 acpi_mp_pgd = __pa(pgd);
168
169 return 0;
170 }
171
/*
 * Wake up a secondary CPU through the MADT wakeup mailbox.
 *
 * @apicid:   APIC ID of the CPU to wake up.
 * @start_ip: address written to the mailbox 'wakeup_vector' field.
 *
 * Returns 0 once the 'command' field of the mailbox has been cleared,
 * -EOPNOTSUPP if no mailbox address is known, or -ENOMEM if the mailbox
 * cannot be mapped.
 */
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		/* memremap() may fail; do not dereference a NULL mailbox. */
		if (!acpi_mp_wake_mailbox) {
			pr_err("Failed to map MADT wakeup mailbox\n");
			return -ENOMEM;
		}
	}

	/*
	 * Mailbox memory is shared between the firmware and OS. Firmware will
	 * listen on mailbox command address, and once it receives the wakeup
	 * command, the CPU associated with the given apicid will be booted.
	 *
	 * The value of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id	    = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * ACPI specification doesn't provide any guidance on how long kernel
	 * has to wait for a wake up acknowledgment. It also doesn't provide
	 * a way to cancel a wake up request if it takes too long.
	 *
	 * In TDX environment, the VMM has control over how long it takes to
	 * wake up secondary. It can postpone scheduling secondary vCPU
	 * indefinitely. Giving up on wake up request and reporting error opens
	 * possible attack vector for VMM: it can wake up a secondary CPU when
	 * kernel doesn't expect it. Wait until positive result of the wake up
	 * request.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}
228
acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup * mp_wake)229 static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
230 {
231 cpu_hotplug_disable_offlining();
232
233 /*
234 * ACPI MADT doesn't allow to offline a CPU after it was onlined. This
235 * limits kexec: the second kernel won't be able to use more than one CPU.
236 *
237 * To prevent a kexec kernel from onlining secondary CPUs invalidate the
238 * mailbox address in the ACPI MADT wakeup structure which prevents a
239 * kexec kernel to use it.
240 *
241 * This is safe as the booting kernel has the mailbox address cached
242 * already and acpi_wakeup_cpu() uses the cached value to bring up the
243 * secondary CPUs.
244 *
245 * Note: This is a Linux specific convention and not covered by the
246 * ACPI specification.
247 */
248 mp_wake->mailbox_address = 0;
249 }
250
acpi_parse_mp_wake(union acpi_subtable_headers * header,const unsigned long end)251 int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
252 const unsigned long end)
253 {
254 struct acpi_madt_multiproc_wakeup *mp_wake;
255
256 mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
257
258 /*
259 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
260 * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
261 * than the actual size of the MP wakeup entry in ACPI table because the
262 * 'reset_vector' is only available in the V1 MP wakeup structure.
263 */
264 if (!mp_wake)
265 return -EINVAL;
266 if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
267 return -EINVAL;
268 if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
269 return -EINVAL;
270
271 acpi_table_print_madt_entry(&header->common);
272
273 acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
274
275 if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
276 mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
277 if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
278 pr_warn("Failed to setup MADT reset vector\n");
279 acpi_mp_disable_offlining(mp_wake);
280 }
281 } else {
282 /*
283 * CPU offlining requires version 1 of the ACPI MADT wakeup
284 * structure.
285 */
286 acpi_mp_disable_offlining(mp_wake);
287 }
288
289 apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
290
291 return 0;
292 }
293