xref: /linux/arch/x86/kernel/crash.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
4  *
5  * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
6  *
7  * Copyright (C) IBM Corporation, 2004. All rights reserved.
8  * Copyright (C) Red Hat Inc., 2014. All rights reserved.
9  * Authors:
10  *      Vivek Goyal <vgoyal@redhat.com>
11  *
12  */
13 
14 #define pr_fmt(fmt)	"kexec: " fmt
15 
16 #include <linux/types.h>
17 #include <linux/kernel.h>
18 #include <linux/smp.h>
19 #include <linux/reboot.h>
20 #include <linux/kexec.h>
21 #include <linux/delay.h>
22 #include <linux/elf.h>
23 #include <linux/elfcore.h>
24 #include <linux/export.h>
25 #include <linux/slab.h>
26 #include <linux/vmalloc.h>
27 #include <linux/memblock.h>
28 
29 #include <asm/bootparam.h>
30 #include <asm/processor.h>
31 #include <asm/hardirq.h>
32 #include <asm/nmi.h>
33 #include <asm/hw_irq.h>
34 #include <asm/apic.h>
35 #include <asm/e820/types.h>
36 #include <asm/io_apic.h>
37 #include <asm/hpet.h>
38 #include <linux/kdebug.h>
39 #include <asm/cpu.h>
40 #include <asm/reboot.h>
41 #include <asm/tdx.h>
42 #include <asm/intel_pt.h>
43 #include <asm/crash.h>
44 #include <asm/cmdline.h>
45 #include <asm/sev.h>
46 #include <asm/virt.h>
47 
/*
 * Context passed to memmap_entry_callback() while the memory map
 * entries for the second kernel are being prepared.
 */
struct crash_memmap_data {
	/* Boot parameters whose e820 table receives the new entries */
	struct boot_params *params;
	/* E820 memory type stamped on every entry that gets added */
	unsigned int type;
};
54 
55 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
56 
/*
 * Callback handed to nmi_shootdown_cpus(): save the register state of
 * @cpu, stop Intel PT, run the SEV kdump callback and finally switch
 * off the local APIC.
 */
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
	crash_save_cpu(regs, cpu);

	/* Intel PT must stop logging from now on */
	cpu_emergency_stop_pt();

	kdump_sev_callback();

	disable_local_APIC();
}
70 
71 void kdump_nmi_shootdown_cpus(void)
72 {
73 	nmi_shootdown_cpus(kdump_nmi_callback);
74 
75 	disable_local_APIC();
76 }
77 
78 /* Override the weak function in kernel/panic.c */
79 void crash_smp_send_stop(void)
80 {
81 	static int cpus_stopped;
82 
83 	if (cpus_stopped)
84 		return;
85 
86 	if (smp_ops.crash_stop_other_cpus)
87 		smp_ops.crash_stop_other_cpus();
88 	else
89 		smp_send_stop();
90 
91 	cpus_stopped = 1;
92 }
93 
94 #else
/*
 * Variant built when CONFIG_SMP and CONFIG_X86_LOCAL_APIC are not both
 * enabled: there are no cpus to shootdown, so this is a no-op.
 */
void crash_smp_send_stop(void)
{
}
99 #endif
100 
101 void native_machine_crash_shutdown(struct pt_regs *regs)
102 {
103 	/* This function is only called after the system
104 	 * has panicked or is otherwise in a critical state.
105 	 * The minimum amount of code to allow a kexec'd kernel
106 	 * to run successfully needs to happen here.
107 	 *
108 	 * In practice this means shooting down the other cpus in
109 	 * an SMP system.
110 	 */
111 	/* The kernel is broken so disable interrupts */
112 	local_irq_disable();
113 
114 	crash_smp_send_stop();
115 
116 	tdx_sys_disable();
117 	x86_virt_emergency_disable_virtualization_cpu();
118 
119 	/*
120 	 * Disable Intel PT to stop its logging
121 	 */
122 	cpu_emergency_stop_pt();
123 
124 #ifdef CONFIG_X86_IO_APIC
125 	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
126 	ioapic_zap_locks();
127 	clear_IO_APIC();
128 #endif
129 	lapic_shutdown();
130 	restore_boot_irq_mode();
131 #ifdef CONFIG_HPET_TIMER
132 	hpet_disable();
133 #endif
134 
135 	/*
136 	 * Non-crash kexec calls enc_kexec_begin() while scheduling is still
137 	 * active. This allows the callback to wait until all in-flight
138 	 * shared<->private conversions are complete. In a crash scenario,
139 	 * enc_kexec_begin() gets called after all but one CPU have been shut
140 	 * down and interrupts have been disabled. This allows the callback to
141 	 * detect a race with the conversion and report it.
142 	 */
143 	x86_platform.guest.enc_kexec_begin();
144 	x86_platform.guest.enc_kexec_finish();
145 
146 	crash_save_cpu(regs, smp_processor_id());
147 }
148 
149 #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
/*
 * Resource-walk callback that counts ranges: @arg points to an
 * unsigned int counter that is bumped once per invocation. @res is not
 * inspected. Always returns 0.
 */
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
	unsigned int *count = arg;

	*count += 1;
	return 0;
}
157 
158 /* Gather all the required information to prepare elf headers for ram regions */
159 static struct crash_mem *fill_up_crash_elf_data(void)
160 {
161 	unsigned int nr_ranges = 0;
162 	struct crash_mem *cmem;
163 
164 	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
165 	if (!nr_ranges)
166 		return NULL;
167 
168 	/*
169 	 * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
170 	 * may cause range splits. So add extra slots here.
171 	 *
172 	 * Exclusion of low 1M may not cause another range split, because the
173 	 * range of exclude is [0, 1M] and the condition for splitting a new
174 	 * region is that the start, end parameters are both in a certain
175 	 * existing region in cmem and cannot be equal to existing region's
176 	 * start or end. Obviously, the start of [0, 1M] cannot meet this
177 	 * condition.
178 	 *
179 	 * But in order to lest the low 1M could be changed in the future,
180 	 * (e.g. [start, 1M]), add a extra slot.
181 	 */
182 	nr_ranges += 3 + crashk_cma_cnt;
183 	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
184 	if (!cmem)
185 		return NULL;
186 
187 	cmem->max_nr_ranges = nr_ranges;
188 
189 	return cmem;
190 }
191 
192 /*
193  * Look for any unwanted ranges between mstart, mend and remove them. This
194  * might lead to split and split ranges are put in cmem->ranges[] array
195  */
196 static int elf_header_exclude_ranges(struct crash_mem *cmem)
197 {
198 	int ret = 0;
199 	int i;
200 
201 	/* Exclude the low 1M because it is always reserved */
202 	ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
203 	if (ret)
204 		return ret;
205 
206 	/* Exclude crashkernel region */
207 	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
208 	if (ret)
209 		return ret;
210 
211 	if (crashk_low_res.end)
212 		ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
213 					      crashk_low_res.end);
214 	if (ret)
215 		return ret;
216 
217 	for (i = 0; i < crashk_cma_cnt; ++i) {
218 		ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
219 					      crashk_cma_ranges[i].end);
220 		if (ret)
221 			return ret;
222 	}
223 
224 	return 0;
225 }
226 
227 static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
228 {
229 	struct crash_mem *cmem = arg;
230 
231 	cmem->ranges[cmem->nr_ranges].start = res->start;
232 	cmem->ranges[cmem->nr_ranges].end = res->end;
233 	cmem->nr_ranges++;
234 
235 	return 0;
236 }
237 
238 /* Prepare elf headers. Return addr and size */
239 static int prepare_elf_headers(void **addr, unsigned long *sz,
240 			       unsigned long *nr_mem_ranges)
241 {
242 	struct crash_mem *cmem;
243 	int ret;
244 
245 	cmem = fill_up_crash_elf_data();
246 	if (!cmem)
247 		return -ENOMEM;
248 
249 	ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
250 	if (ret)
251 		goto out;
252 
253 	/* Exclude unwanted mem ranges */
254 	ret = elf_header_exclude_ranges(cmem);
255 	if (ret)
256 		goto out;
257 
258 	/* Return the computed number of memory ranges, for hotplug usage */
259 	*nr_mem_ranges = cmem->nr_ranges;
260 
261 	/* By default prepare 64bit headers */
262 	ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
263 
264 out:
265 	vfree(cmem);
266 	return ret;
267 }
268 #endif
269 
270 #ifdef CONFIG_KEXEC_FILE
271 static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
272 {
273 	unsigned int nr_e820_entries;
274 
275 	nr_e820_entries = params->e820_entries;
276 	if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
277 		return 1;
278 
279 	memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
280 	params->e820_entries++;
281 	return 0;
282 }
283 
284 static int memmap_entry_callback(struct resource *res, void *arg)
285 {
286 	struct crash_memmap_data *cmd = arg;
287 	struct boot_params *params = cmd->params;
288 	struct e820_entry ei;
289 
290 	ei.addr = res->start;
291 	ei.size = resource_size(res);
292 	ei.type = cmd->type;
293 	add_e820_entry(params, &ei);
294 
295 	return 0;
296 }
297 
298 static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
299 				 unsigned long long mstart,
300 				 unsigned long long mend)
301 {
302 	unsigned long start, end;
303 	int ret;
304 
305 	cmem->ranges[0].start = mstart;
306 	cmem->ranges[0].end = mend;
307 	cmem->nr_ranges = 1;
308 
309 	/* Exclude elf header region */
310 	start = image->elf_load_addr;
311 	end = start + image->elf_headers_sz - 1;
312 	ret = crash_exclude_mem_range(cmem, start, end);
313 
314 	if (ret)
315 		return ret;
316 
317 	/* Exclude dm crypt keys region */
318 	if (image->dm_crypt_keys_addr) {
319 		start = image->dm_crypt_keys_addr;
320 		end = start + image->dm_crypt_keys_sz - 1;
321 		return crash_exclude_mem_range(cmem, start, end);
322 	}
323 
324 	return ret;
325 }
326 
327 /* Prepare memory map for crash dump kernel */
328 int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
329 {
330 	unsigned int nr_ranges = 0;
331 	int i, ret = 0;
332 	unsigned long flags;
333 	struct e820_entry ei;
334 	struct crash_memmap_data cmd;
335 	struct crash_mem *cmem;
336 
337 	/*
338 	 * In the current x86 architecture code, the elfheader is always
339 	 * allocated at crashk_res.start. But it depends on the allocation
340 	 * position of elfheader in crashk_res. To avoid potential out of
341 	 * bounds in future, add an extra slot.
342 	 *
343 	 * And using random kexec_buf for passing dm crypt keys may cause a
344 	 * range split too, add another extra slot here.
345 	 */
346 	nr_ranges = 3;
347 	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
348 	if (!cmem)
349 		return -ENOMEM;
350 
351 	cmem->max_nr_ranges = nr_ranges;
352 
353 	memset(&cmd, 0, sizeof(struct crash_memmap_data));
354 	cmd.params = params;
355 
356 	/* Add the low 1M */
357 	cmd.type = E820_TYPE_RAM;
358 	flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
359 	walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
360 			    memmap_entry_callback);
361 
362 	/* Add ACPI tables */
363 	cmd.type = E820_TYPE_ACPI;
364 	flags = IORESOURCE_MEM | IORESOURCE_BUSY;
365 	walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
366 			    memmap_entry_callback);
367 
368 	/* Add ACPI Non-volatile Storage */
369 	cmd.type = E820_TYPE_NVS;
370 	walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
371 			    memmap_entry_callback);
372 
373 	/* Add e820 reserved ranges */
374 	cmd.type = E820_TYPE_RESERVED;
375 	flags = IORESOURCE_MEM;
376 	walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
377 			    memmap_entry_callback);
378 
379 	/* Add crashk_low_res region */
380 	if (crashk_low_res.end) {
381 		ei.addr = crashk_low_res.start;
382 		ei.size = resource_size(&crashk_low_res);
383 		ei.type = E820_TYPE_RAM;
384 		add_e820_entry(params, &ei);
385 	}
386 
387 	/* Exclude some ranges from crashk_res and add rest to memmap */
388 	ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
389 	if (ret)
390 		goto out;
391 
392 	for (i = 0; i < cmem->nr_ranges; i++) {
393 		ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
394 
395 		/* If entry is less than a page, skip it */
396 		if (ei.size < PAGE_SIZE)
397 			continue;
398 		ei.addr = cmem->ranges[i].start;
399 		ei.type = E820_TYPE_RAM;
400 		add_e820_entry(params, &ei);
401 	}
402 
403 	for (i = 0; i < crashk_cma_cnt; ++i) {
404 		ei.addr = crashk_cma_ranges[i].start;
405 		ei.size = crashk_cma_ranges[i].end -
406 			  crashk_cma_ranges[i].start + 1;
407 		ei.type = E820_TYPE_RAM;
408 		add_e820_entry(params, &ei);
409 	}
410 
411 out:
412 	vfree(cmem);
413 	return ret;
414 }
415 
416 int crash_load_segments(struct kimage *image)
417 {
418 	int ret;
419 	unsigned long pnum = 0;
420 	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
421 				  .buf_max = ULONG_MAX, .top_down = false };
422 
423 	/* Prepare elf headers and add a segment */
424 	ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
425 	if (ret)
426 		return ret;
427 
428 	image->elf_headers	= kbuf.buffer;
429 	image->elf_headers_sz	= kbuf.bufsz;
430 	kbuf.memsz		= kbuf.bufsz;
431 
432 #ifdef CONFIG_CRASH_HOTPLUG
433 	/*
434 	 * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map,
435 	 * maximum CPUs and maximum memory ranges.
436 	 */
437 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
438 		pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
439 	else
440 		pnum += 2 + CONFIG_NR_CPUS_DEFAULT;
441 
442 	if (pnum < (unsigned long)PN_XNUM) {
443 		kbuf.memsz = pnum * sizeof(Elf64_Phdr);
444 		kbuf.memsz += sizeof(Elf64_Ehdr);
445 
446 		image->elfcorehdr_index = image->nr_segments;
447 
448 		/* Mark as usable to crash kernel, else crash kernel fails on boot */
449 		image->elf_headers_sz = kbuf.memsz;
450 	} else {
451 		pr_err("number of Phdrs %lu exceeds max\n", pnum);
452 	}
453 #endif
454 
455 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
456 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
457 	ret = kexec_add_buffer(&kbuf);
458 	if (ret)
459 		return ret;
460 	image->elf_load_addr = kbuf.mem;
461 	kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
462 		      image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
463 
464 	return ret;
465 }
466 #endif /* CONFIG_KEXEC_FILE */
467 
468 #ifdef CONFIG_CRASH_HOTPLUG
469 
470 #undef pr_fmt
471 #define pr_fmt(fmt) "crash hp: " fmt
472 
473 int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
474 {
475 
476 #ifdef CONFIG_KEXEC_FILE
477 	if (image->file_mode)
478 		return 1;
479 #endif
480 	/*
481 	 * Initially, crash hotplug support for kexec_load was added
482 	 * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
483 	 * functionality was expanded to accommodate multiple kexec
484 	 * segment updates, leading to the introduction of the
485 	 * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
486 	 * when the kexec tool sends either of these flags, it indicates
487 	 * that the required kexec segment (elfcorehdr) is excluded from
488 	 * the SHA calculation.
489 	 */
490 	return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
491 		kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
492 }
493 
494 unsigned int arch_crash_get_elfcorehdr_size(void)
495 {
496 	unsigned int sz;
497 
498 	/* kernel_map, VMCOREINFO and maximum CPUs */
499 	sz = 2 + CONFIG_NR_CPUS_DEFAULT;
500 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
501 		sz += CONFIG_CRASH_MAX_MEMORY_RANGES;
502 	sz *= sizeof(Elf64_Phdr);
503 	return sz;
504 }
505 
506 /**
507  * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
508  * @image: a pointer to kexec_crash_image
509  * @arg: struct memory_notify handler for memory hotplug case and
510  *       NULL for CPU hotplug case.
511  *
512  * Prepare the new elfcorehdr and replace the existing elfcorehdr.
513  */
514 void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
515 {
516 	void *elfbuf = NULL, *old_elfcorehdr;
517 	unsigned long nr_mem_ranges;
518 	unsigned long mem, memsz;
519 	unsigned long elfsz = 0;
520 
521 	/*
522 	 * As crash_prepare_elf64_headers() has already described all
523 	 * possible CPUs, there is no need to update the elfcorehdr
524 	 * for additional CPU changes.
525 	 */
526 	if ((image->file_mode || image->elfcorehdr_updated) &&
527 		((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) ||
528 		(image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU)))
529 		return;
530 
531 	/*
532 	 * Create the new elfcorehdr reflecting the changes to CPU and/or
533 	 * memory resources.
534 	 */
535 	if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
536 		pr_err("unable to create new elfcorehdr");
537 		goto out;
538 	}
539 
540 	/*
541 	 * Obtain address and size of the elfcorehdr segment, and
542 	 * check it against the new elfcorehdr buffer.
543 	 */
544 	mem = image->segment[image->elfcorehdr_index].mem;
545 	memsz = image->segment[image->elfcorehdr_index].memsz;
546 	if (elfsz > memsz) {
547 		pr_err("update elfcorehdr elfsz %lu > memsz %lu",
548 			elfsz, memsz);
549 		goto out;
550 	}
551 
552 	/*
553 	 * Copy new elfcorehdr over the old elfcorehdr at destination.
554 	 */
555 	old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
556 	if (!old_elfcorehdr) {
557 		pr_err("mapping elfcorehdr segment failed\n");
558 		goto out;
559 	}
560 
561 	/*
562 	 * Temporarily invalidate the crash image while the
563 	 * elfcorehdr is updated.
564 	 */
565 	xchg(&kexec_crash_image, NULL);
566 	memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz);
567 	xchg(&kexec_crash_image, image);
568 	kunmap_local(old_elfcorehdr);
569 	pr_debug("updated elfcorehdr\n");
570 
571 out:
572 	vfree(elfbuf);
573 }
574 #endif
575