// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>

/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;

/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;

static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
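
/*
 * For reference, the wakeup mailbox pointed to by mailbox_address is a 4K,
 * 4K-aligned page shared with the firmware. The sketch below reflects the
 * ACPI Multiprocessor Wakeup Mailbox Structure as defined by ACPICA; see
 * include/acpi/actbl2.h for the authoritative definition:
 *
 *	struct acpi_madt_multiproc_wakeup_mailbox {
 *		u16 command;		// 0: noop, 1: wakeup, 2: test
 *		u16 reserved;		// must be zero
 *		u32 apic_id;		// APIC ID of the CPU to act on
 *		u64 wakeup_vector;	// entry point for the woken CPU
 *		u8  reserved_os[2032];
 *		u8  reserved_firmware[2048];
 *	};
 *
 * Roughly, the first 2K of the page belongs to the OS and the second 2K to
 * the firmware.
 */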

static void acpi_mp_stop_this_cpu(void)
{
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_play_dead(void)
{
	play_dead_common();
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_cpu_die(unsigned int cpu)
{
	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
	unsigned long timeout;

	/*
	 * Use the TEST mailbox command to prove that the BIOS got control
	 * over the CPU before declaring it dead.
	 *
	 * The BIOS has to clear the 'command' field of the mailbox.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_TEST);

	/* Don't wait longer than a second. */
	timeout = USEC_PER_SEC;
	while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
		udelay(1);

	if (!timeout)
		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}

/* The argument is required to match the type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}

static void __init free_pgt_page(void *pgt, void *dummy)
{
	return memblock_free(pgt, PAGE_SIZE);
}

/*
 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
 * the same place as in the kernel page tables. asm_acpi_mp_play_dead()
 * switches to the identity mapping, so the function has to be present at
 * the same spot in the virtual address space before and after switching
 * page tables.
 */
static int __init init_transition_pgtable(pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = (unsigned long)asm_acpi_mp_play_dead;
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)alloc_pgt_page(NULL);
		if (!p4d)
			return -ENOMEM;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)alloc_pgt_page(NULL);
		if (!pud)
			return -ENOMEM;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)alloc_pgt_page(NULL);
		if (!pmd)
			return -ENOMEM;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)alloc_pgt_page(NULL);
		if (!pte)
			return -ENOMEM;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

	paddr = __pa(vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));

	return 0;
}

static int __init acpi_mp_setup_reset(u64 reset_vector)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.free_pgt_page = free_pgt_page,
		.page_flag = __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag = _KERNPG_TABLE_NOENC,
	};
	pgd_t *pgd;

	pgd = alloc_pgt_page(NULL);
	if (!pgd)
		return -ENOMEM;

	for (int i = 0; i < nr_pfn_mapped; i++) {
		unsigned long mstart, mend;

		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend = pfn_mapped[i].end << PAGE_SHIFT;
		if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
			kernel_ident_mapping_free(&info, pgd);
			return -ENOMEM;
		}
	}

	if (kernel_ident_mapping_init(&info, pgd,
				      PAGE_ALIGN_DOWN(reset_vector),
				      PAGE_ALIGN(reset_vector + 1))) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	if (init_transition_pgtable(pgd)) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	smp_ops.play_dead = acpi_mp_play_dead;
	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
	smp_ops.cpu_die = acpi_mp_cpu_die;

	acpi_mp_reset_vector_paddr = reset_vector;
	acpi_mp_pgd = __pa(pgd);

	return 0;
}
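
/*
 * Taken together, CPU offlining proceeds roughly as follows (a summary of
 * the code above, not a normative description):
 *
 *  1. The dying CPU runs acpi_mp_play_dead(): asm_acpi_mp_play_dead()
 *     switches to the identity-mapped page tables built by
 *     acpi_mp_setup_reset() and jumps to the firmware reset vector.
 *  2. The survivor then runs acpi_mp_cpu_die(), which issues the TEST
 *     command; the firmware acknowledges that it has taken over the CPU
 *     by clearing ->command.
 *  3. A later WAKEUP command, issued by acpi_wakeup_cpu() below, hands
 *     the parked CPU back to the kernel at 'wakeup_vector'.
 */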

static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bring up secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		/* Cannot wake up the CPU without access to the mailbox. */
		if (!acpi_mp_wake_mailbox)
			return -ENOMEM;
	}

	/*
	 * Mailbox memory is shared between the firmware and OS. The firmware
	 * listens on the mailbox command address, and once it receives the
	 * wakeup command, the CPU associated with the given apicid will be
	 * booted.
	 *
	 * The values of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * The ACPI specification doesn't provide any guidance on how long the
	 * kernel has to wait for a wakeup acknowledgment. It also doesn't
	 * provide a way to cancel a wakeup request if it takes too long.
	 *
	 * In a TDX environment, the VMM has control over how long it takes to
	 * wake up a secondary CPU. It can postpone scheduling the secondary
	 * vCPU indefinitely. Giving up on the wakeup request and reporting an
	 * error opens a possible attack vector for the VMM: it can wake up a
	 * secondary CPU when the kernel doesn't expect it. Wait until there
	 * is a positive result of the wakeup request.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}

static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
	cpu_hotplug_disable_offlining();

	/*
	 * ACPI MADT doesn't allow a CPU to be offlined after it has been
	 * onlined. This limits kexec: the second kernel won't be able to use
	 * more than one CPU.
	 *
	 * To prevent a kexec kernel from onlining secondary CPUs, invalidate
	 * the mailbox address in the ACPI MADT wakeup structure, which
	 * prevents a kexec kernel from using it.
	 *
	 * This is safe as the booting kernel has the mailbox address cached
	 * already and acpi_wakeup_cpu() uses the cached value to bring up the
	 * secondary CPUs.
	 *
	 * Note: This is a Linux-specific convention and is not covered by the
	 * ACPI specification.
	 */
	mp_wake->mailbox_address = 0;
}

int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
			      const unsigned long end)
{
	struct acpi_madt_multiproc_wakeup *mp_wake;

	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;

	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the
	 * @mp_wake entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be
	 * larger than the actual size of the MP wakeup entry in the ACPI
	 * table because the 'reset_vector' is only available in the V1 MP
	 * wakeup structure.
	 */
	if (!mp_wake)
		return -EINVAL;
	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;
	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;

	acpi_table_print_madt_entry(&header->common);

	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;

	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
			pr_warn("Failed to set up MADT reset vector\n");
			acpi_mp_disable_offlining(mp_wake);
		}
	} else {
		/*
		 * CPU offlining requires version 1 of the ACPI MADT wakeup
		 * structure.
		 */
		acpi_mp_disable_offlining(mp_wake);
	}

	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);

	return 0;
}
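
/*
 * For context: nothing in this file calls acpi_parse_mp_wake() directly.
 * The MADT parser invokes it during boot, along the lines of the following
 * (from arch/x86/kernel/acpi/boot.c, guarded by CONFIG_ACPI_MADT_WAKEUP at
 * the time of writing):
 *
 *	acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
 *			      acpi_parse_mp_wake, 1);
 */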