xref: /linux/kernel/kexec_core.c (revision c745b15c1f9cea5680c2906ae868302108f8daf0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * kexec.c - kexec system call core code.
4  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/btf.h>
10 #include <linux/capability.h>
11 #include <linux/mm.h>
12 #include <linux/file.h>
13 #include <linux/slab.h>
14 #include <linux/fs.h>
15 #include <linux/kexec.h>
16 #include <linux/mutex.h>
17 #include <linux/list.h>
18 #include <linux/highmem.h>
19 #include <linux/syscalls.h>
20 #include <linux/reboot.h>
21 #include <linux/ioport.h>
22 #include <linux/hardirq.h>
23 #include <linux/elf.h>
24 #include <linux/elfcore.h>
25 #include <linux/utsname.h>
26 #include <linux/numa.h>
27 #include <linux/suspend.h>
28 #include <linux/device.h>
29 #include <linux/freezer.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/pm.h>
32 #include <linux/cpu.h>
33 #include <linux/uaccess.h>
34 #include <linux/io.h>
35 #include <linux/console.h>
36 #include <linux/vmalloc.h>
37 #include <linux/swap.h>
38 #include <linux/syscore_ops.h>
39 #include <linux/compiler.h>
40 #include <linux/hugetlb.h>
41 #include <linux/objtool.h>
42 #include <linux/kmsg_dump.h>
43 
44 #include <asm/page.h>
45 #include <asm/sections.h>
46 
47 #include <crypto/hash.h>
48 #include "kexec_internal.h"
49 
/*
 * Atomic word backing the kexec lock; it starts out as 0.
 * NOTE(review): presumably manipulated only through kexec_trylock() and
 * kexec_unlock(), which are defined outside this file - confirm.
 */
atomic_t __kexec_lock = ATOMIC_INIT(0);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

/*
 * NOTE(review): looks like a toggle for debug printing in the file-based
 * kexec path; it is not read anywhere in this part of the file - confirm.
 */
bool kexec_file_dbg_print;
56 
57 int kexec_should_crash(struct task_struct *p)
58 {
59 	/*
60 	 * If crash_kexec_post_notifiers is enabled, don't run
61 	 * crash_kexec() here yet, which must be run after panic
62 	 * notifiers in panic().
63 	 */
64 	if (crash_kexec_post_notifiers)
65 		return 0;
66 	/*
67 	 * There are 4 panic() calls in make_task_dead() path, each of which
68 	 * corresponds to each of these 4 conditions.
69 	 */
70 	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
71 		return 1;
72 	return 0;
73 }
74 
75 int kexec_crash_loaded(void)
76 {
77 	return !!kexec_crash_image;
78 }
79 EXPORT_SYMBOL_GPL(kexec_crash_loaded);
80 
81 /*
82  * When kexec transitions to the new kernel there is a one-to-one
83  * mapping between physical and virtual addresses.  On processors
84  * where you can disable the MMU this is trivial, and easy.  For
85  * others it is still a simple predictable page table to setup.
86  *
87  * In that environment kexec copies the new kernel to its final
88  * resting place.  This means I can only support memory whose
89  * physical address can fit in an unsigned long.  In particular
90  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
91  * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
93  * defined more restrictively in <asm/kexec.h>.
94  *
95  * The code for the transition from the current kernel to the
96  * new kernel is placed in the control_code_buffer, whose size
97  * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
98  * page of memory is necessary, but some architectures require more.
99  * Because this memory must be identity mapped in the transition from
100  * virtual to physical addresses it must live in the range
101  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
102  * modifiable.
103  *
104  * The assembly stub in the control code buffer is passed a linked list
105  * of descriptor pages detailing the source pages of the new kernel,
106  * and the destination addresses of those source pages.  As this data
107  * structure is not used in the context of the current OS, it must
108  * be self-contained.
109  *
110  * The code has been made to work with highmem pages and will use a
111  * destination page in its final resting place (if it happens
112  * to allocate it).  The end product of this is that most of the
113  * physical address space, and most of RAM can be used.
114  *
115  * Future directions include:
116  *  - allocating a page table with the control code buffer identity
117  *    mapped, to simplify machine_kexec and make kexec_on_panic more
118  *    reliable.
119  */
120 
/*
 * KIMAGE_NO_DEST is an impossible destination address, used when
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)
/* Number of pages needed to hold x bytes, rounding any partial page up. */
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

/* Forward declaration: kimage_add_entry() uses this before its definition. */
static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);
131 
132 int sanity_check_segment_list(struct kimage *image)
133 {
134 	int i;
135 	unsigned long nr_segments = image->nr_segments;
136 	unsigned long total_pages = 0;
137 	unsigned long nr_pages = totalram_pages();
138 
139 	/*
140 	 * Verify we have good destination addresses.  The caller is
141 	 * responsible for making certain we don't attempt to load
142 	 * the new image into invalid or reserved areas of RAM.  This
143 	 * just verifies it is an address we can use.
144 	 *
145 	 * Since the kernel does everything in page size chunks ensure
146 	 * the destination addresses are page aligned.  Too many
147 	 * special cases crop of when we don't do this.  The most
148 	 * insidious is getting overlapping destination addresses
149 	 * simply because addresses are changed to page size
150 	 * granularity.
151 	 */
152 	for (i = 0; i < nr_segments; i++) {
153 		unsigned long mstart, mend;
154 
155 		mstart = image->segment[i].mem;
156 		mend   = mstart + image->segment[i].memsz;
157 		if (mstart > mend)
158 			return -EADDRNOTAVAIL;
159 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
160 			return -EADDRNOTAVAIL;
161 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
162 			return -EADDRNOTAVAIL;
163 	}
164 
165 	/* Verify our destination addresses do not overlap.
166 	 * If we alloed overlapping destination addresses
167 	 * through very weird things can happen with no
168 	 * easy explanation as one segment stops on another.
169 	 */
170 	for (i = 0; i < nr_segments; i++) {
171 		unsigned long mstart, mend;
172 		unsigned long j;
173 
174 		mstart = image->segment[i].mem;
175 		mend   = mstart + image->segment[i].memsz;
176 		for (j = 0; j < i; j++) {
177 			unsigned long pstart, pend;
178 
179 			pstart = image->segment[j].mem;
180 			pend   = pstart + image->segment[j].memsz;
181 			/* Do the segments overlap ? */
182 			if ((mend > pstart) && (mstart < pend))
183 				return -EINVAL;
184 		}
185 	}
186 
187 	/* Ensure our buffer sizes are strictly less than
188 	 * our memory sizes.  This should always be the case,
189 	 * and it is easier to check up front than to be surprised
190 	 * later on.
191 	 */
192 	for (i = 0; i < nr_segments; i++) {
193 		if (image->segment[i].bufsz > image->segment[i].memsz)
194 			return -EINVAL;
195 	}
196 
197 	/*
198 	 * Verify that no more than half of memory will be consumed. If the
199 	 * request from userspace is too large, a large amount of time will be
200 	 * wasted allocating pages, which can cause a soft lockup.
201 	 */
202 	for (i = 0; i < nr_segments; i++) {
203 		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
204 			return -EINVAL;
205 
206 		total_pages += PAGE_COUNT(image->segment[i].memsz);
207 	}
208 
209 	if (total_pages > nr_pages / 2)
210 		return -EINVAL;
211 
212 	/*
213 	 * Verify we have good destination addresses.  Normally
214 	 * the caller is responsible for making certain we don't
215 	 * attempt to load the new image into invalid or reserved
216 	 * areas of RAM.  But crash kernels are preloaded into a
217 	 * reserved area of ram.  We must ensure the addresses
218 	 * are in the reserved area otherwise preloading the
219 	 * kernel could corrupt things.
220 	 */
221 
222 	if (image->type == KEXEC_TYPE_CRASH) {
223 		for (i = 0; i < nr_segments; i++) {
224 			unsigned long mstart, mend;
225 
226 			mstart = image->segment[i].mem;
227 			mend = mstart + image->segment[i].memsz - 1;
228 			/* Ensure we are within the crash kernel limits */
229 			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
230 			    (mend > phys_to_boot_phys(crashk_res.end)))
231 				return -EADDRNOTAVAIL;
232 		}
233 	}
234 
235 	return 0;
236 }
237 
238 struct kimage *do_kimage_alloc_init(void)
239 {
240 	struct kimage *image;
241 
242 	/* Allocate a controlling structure */
243 	image = kzalloc(sizeof(*image), GFP_KERNEL);
244 	if (!image)
245 		return NULL;
246 
247 	image->head = 0;
248 	image->entry = &image->head;
249 	image->last_entry = &image->head;
250 	image->control_page = ~0; /* By default this does not apply */
251 	image->type = KEXEC_TYPE_DEFAULT;
252 
253 	/* Initialize the list of control pages */
254 	INIT_LIST_HEAD(&image->control_pages);
255 
256 	/* Initialize the list of destination pages */
257 	INIT_LIST_HEAD(&image->dest_pages);
258 
259 	/* Initialize the list of unusable pages */
260 	INIT_LIST_HEAD(&image->unusable_pages);
261 
262 #ifdef CONFIG_CRASH_HOTPLUG
263 	image->hp_action = KEXEC_CRASH_HP_NONE;
264 	image->elfcorehdr_index = -1;
265 	image->elfcorehdr_updated = false;
266 #endif
267 
268 	return image;
269 }
270 
271 int kimage_is_destination_range(struct kimage *image,
272 					unsigned long start,
273 					unsigned long end)
274 {
275 	unsigned long i;
276 
277 	for (i = 0; i < image->nr_segments; i++) {
278 		unsigned long mstart, mend;
279 
280 		mstart = image->segment[i].mem;
281 		mend = mstart + image->segment[i].memsz - 1;
282 		if ((end >= mstart) && (start <= mend))
283 			return 1;
284 	}
285 
286 	return 0;
287 }
288 
289 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
290 {
291 	struct page *pages;
292 
293 	if (fatal_signal_pending(current))
294 		return NULL;
295 	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
296 	if (pages) {
297 		unsigned int count, i;
298 
299 		pages->mapping = NULL;
300 		set_page_private(pages, order);
301 		count = 1 << order;
302 		for (i = 0; i < count; i++)
303 			SetPageReserved(pages + i);
304 
305 		arch_kexec_post_alloc_pages(page_address(pages), count,
306 					    gfp_mask);
307 
308 		if (gfp_mask & __GFP_ZERO)
309 			for (i = 0; i < count; i++)
310 				clear_highpage(pages + i);
311 	}
312 
313 	return pages;
314 }
315 
316 static void kimage_free_pages(struct page *page)
317 {
318 	unsigned int order, count, i;
319 
320 	order = page_private(page);
321 	count = 1 << order;
322 
323 	arch_kexec_pre_free_pages(page_address(page), count);
324 
325 	for (i = 0; i < count; i++)
326 		ClearPageReserved(page + i);
327 	__free_pages(page, order);
328 }
329 
330 void kimage_free_page_list(struct list_head *list)
331 {
332 	struct page *page, *next;
333 
334 	list_for_each_entry_safe(page, next, list, lru) {
335 		list_del(&page->lru);
336 		kimage_free_pages(page);
337 	}
338 }
339 
340 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
341 							unsigned int order)
342 {
343 	/* Control pages are special, they are the intermediaries
344 	 * that are needed while we copy the rest of the pages
345 	 * to their final resting place.  As such they must
346 	 * not conflict with either the destination addresses
347 	 * or memory the kernel is already using.
348 	 *
349 	 * The only case where we really need more than one of
350 	 * these are for architectures where we cannot disable
351 	 * the MMU and must instead generate an identity mapped
352 	 * page table for all of the memory.
353 	 *
354 	 * At worst this runs in O(N) of the image size.
355 	 */
356 	struct list_head extra_pages;
357 	struct page *pages;
358 	unsigned int count;
359 
360 	count = 1 << order;
361 	INIT_LIST_HEAD(&extra_pages);
362 
363 	/* Loop while I can allocate a page and the page allocated
364 	 * is a destination page.
365 	 */
366 	do {
367 		unsigned long pfn, epfn, addr, eaddr;
368 
369 		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
370 		if (!pages)
371 			break;
372 		pfn   = page_to_boot_pfn(pages);
373 		epfn  = pfn + count;
374 		addr  = pfn << PAGE_SHIFT;
375 		eaddr = (epfn << PAGE_SHIFT) - 1;
376 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
377 			      kimage_is_destination_range(image, addr, eaddr)) {
378 			list_add(&pages->lru, &extra_pages);
379 			pages = NULL;
380 		}
381 	} while (!pages);
382 
383 	if (pages) {
384 		/* Remember the allocated page... */
385 		list_add(&pages->lru, &image->control_pages);
386 
387 		/* Because the page is already in it's destination
388 		 * location we will never allocate another page at
389 		 * that address.  Therefore kimage_alloc_pages
390 		 * will not return it (again) and we don't need
391 		 * to give it an entry in image->segment[].
392 		 */
393 	}
394 	/* Deal with the destination pages I have inadvertently allocated.
395 	 *
396 	 * Ideally I would convert multi-page allocations into single
397 	 * page allocations, and add everything to image->dest_pages.
398 	 *
399 	 * For now it is simpler to just free the pages.
400 	 */
401 	kimage_free_page_list(&extra_pages);
402 
403 	return pages;
404 }
405 
406 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
407 						      unsigned int order)
408 {
409 	/* Control pages are special, they are the intermediaries
410 	 * that are needed while we copy the rest of the pages
411 	 * to their final resting place.  As such they must
412 	 * not conflict with either the destination addresses
413 	 * or memory the kernel is already using.
414 	 *
415 	 * Control pages are also the only pags we must allocate
416 	 * when loading a crash kernel.  All of the other pages
417 	 * are specified by the segments and we just memcpy
418 	 * into them directly.
419 	 *
420 	 * The only case where we really need more than one of
421 	 * these are for architectures where we cannot disable
422 	 * the MMU and must instead generate an identity mapped
423 	 * page table for all of the memory.
424 	 *
425 	 * Given the low demand this implements a very simple
426 	 * allocator that finds the first hole of the appropriate
427 	 * size in the reserved memory region, and allocates all
428 	 * of the memory up to and including the hole.
429 	 */
430 	unsigned long hole_start, hole_end, size;
431 	struct page *pages;
432 
433 	pages = NULL;
434 	size = (1 << order) << PAGE_SHIFT;
435 	hole_start = ALIGN(image->control_page, size);
436 	hole_end   = hole_start + size - 1;
437 	while (hole_end <= crashk_res.end) {
438 		unsigned long i;
439 
440 		cond_resched();
441 
442 		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
443 			break;
444 		/* See if I overlap any of the segments */
445 		for (i = 0; i < image->nr_segments; i++) {
446 			unsigned long mstart, mend;
447 
448 			mstart = image->segment[i].mem;
449 			mend   = mstart + image->segment[i].memsz - 1;
450 			if ((hole_end >= mstart) && (hole_start <= mend)) {
451 				/* Advance the hole to the end of the segment */
452 				hole_start = ALIGN(mend, size);
453 				hole_end   = hole_start + size - 1;
454 				break;
455 			}
456 		}
457 		/* If I don't overlap any segments I have found my hole! */
458 		if (i == image->nr_segments) {
459 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
460 			image->control_page = hole_end + 1;
461 			break;
462 		}
463 	}
464 
465 	/* Ensure that these pages are decrypted if SME is enabled. */
466 	if (pages)
467 		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
468 
469 	return pages;
470 }
471 
472 
473 struct page *kimage_alloc_control_pages(struct kimage *image,
474 					 unsigned int order)
475 {
476 	struct page *pages = NULL;
477 
478 	switch (image->type) {
479 	case KEXEC_TYPE_DEFAULT:
480 		pages = kimage_alloc_normal_control_pages(image, order);
481 		break;
482 	case KEXEC_TYPE_CRASH:
483 		pages = kimage_alloc_crash_control_pages(image, order);
484 		break;
485 	}
486 
487 	return pages;
488 }
489 
490 int kimage_crash_copy_vmcoreinfo(struct kimage *image)
491 {
492 	struct page *vmcoreinfo_page;
493 	void *safecopy;
494 
495 	if (image->type != KEXEC_TYPE_CRASH)
496 		return 0;
497 
498 	/*
499 	 * For kdump, allocate one vmcoreinfo safe copy from the
500 	 * crash memory. as we have arch_kexec_protect_crashkres()
501 	 * after kexec syscall, we naturally protect it from write
502 	 * (even read) access under kernel direct mapping. But on
503 	 * the other hand, we still need to operate it when crash
504 	 * happens to generate vmcoreinfo note, hereby we rely on
505 	 * vmap for this purpose.
506 	 */
507 	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
508 	if (!vmcoreinfo_page) {
509 		pr_warn("Could not allocate vmcoreinfo buffer\n");
510 		return -ENOMEM;
511 	}
512 	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
513 	if (!safecopy) {
514 		pr_warn("Could not vmap vmcoreinfo buffer\n");
515 		return -ENOMEM;
516 	}
517 
518 	image->vmcoreinfo_data_copy = safecopy;
519 	crash_update_vmcoreinfo_safecopy(safecopy);
520 
521 	return 0;
522 }
523 
524 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
525 {
526 	if (*image->entry != 0)
527 		image->entry++;
528 
529 	if (image->entry == image->last_entry) {
530 		kimage_entry_t *ind_page;
531 		struct page *page;
532 
533 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
534 		if (!page)
535 			return -ENOMEM;
536 
537 		ind_page = page_address(page);
538 		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
539 		image->entry = ind_page;
540 		image->last_entry = ind_page +
541 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
542 	}
543 	*image->entry = entry;
544 	image->entry++;
545 	*image->entry = 0;
546 
547 	return 0;
548 }
549 
550 static int kimage_set_destination(struct kimage *image,
551 				   unsigned long destination)
552 {
553 	destination &= PAGE_MASK;
554 
555 	return kimage_add_entry(image, destination | IND_DESTINATION);
556 }
557 
558 
559 static int kimage_add_page(struct kimage *image, unsigned long page)
560 {
561 	page &= PAGE_MASK;
562 
563 	return kimage_add_entry(image, page | IND_SOURCE);
564 }
565 
566 
567 static void kimage_free_extra_pages(struct kimage *image)
568 {
569 	/* Walk through and free any extra destination pages I may have */
570 	kimage_free_page_list(&image->dest_pages);
571 
572 	/* Walk through and free any unusable pages I have cached */
573 	kimage_free_page_list(&image->unusable_pages);
574 
575 }
576 
577 void kimage_terminate(struct kimage *image)
578 {
579 	if (*image->entry != 0)
580 		image->entry++;
581 
582 	*image->entry = IND_DONE;
583 }
584 
/*
 * for_each_kimage_entry - walk the entry list of @image.
 * @image: image whose list is walked, starting at image->head.
 * @ptr:   cursor, set to the slot holding the current entry.
 * @entry: receives the value of the current entry.
 *
 * The walk stops at the first zero entry or at an entry with IND_DONE set.
 * After an IND_INDIRECTION entry the cursor jumps to the table that entry
 * points to; otherwise it moves on to the next slot.
 */
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
589 
590 static void kimage_free_entry(kimage_entry_t entry)
591 {
592 	struct page *page;
593 
594 	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
595 	kimage_free_pages(page);
596 }
597 
598 void kimage_free(struct kimage *image)
599 {
600 	kimage_entry_t *ptr, entry;
601 	kimage_entry_t ind = 0;
602 
603 	if (!image)
604 		return;
605 
606 	if (image->vmcoreinfo_data_copy) {
607 		crash_update_vmcoreinfo_safecopy(NULL);
608 		vunmap(image->vmcoreinfo_data_copy);
609 	}
610 
611 	kimage_free_extra_pages(image);
612 	for_each_kimage_entry(image, ptr, entry) {
613 		if (entry & IND_INDIRECTION) {
614 			/* Free the previous indirection page */
615 			if (ind & IND_INDIRECTION)
616 				kimage_free_entry(ind);
617 			/* Save this indirection page until we are
618 			 * done with it.
619 			 */
620 			ind = entry;
621 		} else if (entry & IND_SOURCE)
622 			kimage_free_entry(entry);
623 	}
624 	/* Free the final indirection page */
625 	if (ind & IND_INDIRECTION)
626 		kimage_free_entry(ind);
627 
628 	/* Handle any machine specific cleanup */
629 	machine_kexec_cleanup(image);
630 
631 	/* Free the kexec control pages... */
632 	kimage_free_page_list(&image->control_pages);
633 
634 	/*
635 	 * Free up any temporary buffers allocated. This might hit if
636 	 * error occurred much later after buffer allocation.
637 	 */
638 	if (image->file_mode)
639 		kimage_file_post_load_cleanup(image);
640 
641 	kfree(image);
642 }
643 
644 static kimage_entry_t *kimage_dst_used(struct kimage *image,
645 					unsigned long page)
646 {
647 	kimage_entry_t *ptr, entry;
648 	unsigned long destination = 0;
649 
650 	for_each_kimage_entry(image, ptr, entry) {
651 		if (entry & IND_DESTINATION)
652 			destination = entry & PAGE_MASK;
653 		else if (entry & IND_SOURCE) {
654 			if (page == destination)
655 				return ptr;
656 			destination += PAGE_SIZE;
657 		}
658 	}
659 
660 	return NULL;
661 }
662 
/*
 * kimage_alloc_page - allocate one source page for @image.
 * @image:       image the page is for.
 * @gfp_mask:    allocation flags passed on to kimage_alloc_pages().
 * @destination: address the page's contents will finally be copied to, or
 *               KIMAGE_NO_DEST when the caller does not care.
 *
 * Returns a page that either sits at @destination or is not a destination
 * of any segment, or NULL if kimage_alloc_pages() fails.
 */
static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.   If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_boot_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE - 1))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
			/* Move the data onto the new page ... */
			copy_highpage(page, old_page);
			/* ... and repoint the entry at it, keeping its flags */
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				/* Highmem page, but caller cannot take one */
				kimage_free_pages(old_page);
				continue;
			}
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}
758 
/*
 * kimage_load_normal_segment - load one segment of a normal kexec image.
 * @image:   image being built.
 * @segment: segment to load; its data comes from segment->kbuf in file
 *           mode and from the user pointer segment->buf otherwise.
 *
 * Records the segment's destination in the entry list, then allocates one
 * source page per destination page, registers it and fills it with up to
 * bufsz bytes of data; whatever is not covered by the buffer stays zero.
 *
 * Returns 0 on success, -ENOMEM if a page (or entry table) cannot be
 * allocated and -EFAULT if copying from user space fails.
 */
static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_boot_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap_local_page(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		/* mchunk: destination bytes left in this page */
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		/* uchunk: how many of those are backed by buffer data */
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap_local(ptr);
		/* Non-zero here can only come from copy_from_user() */
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}
826 
/*
 * kimage_load_crash_segment - load one segment of a crash kernel image.
 * @image:   image being built.
 * @segment: segment to load; its data comes from segment->kbuf in file
 *           mode and from the user pointer segment->buf otherwise.
 *
 * Returns 0 on success, -ENOMEM if a destination address has no page and
 * -EFAULT if copying from user space fails.
 */
static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		/* The destination page itself is written; nothing is allocated */
		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		/*
		 * NOTE(review): the post_alloc/pre_free pair presumably makes
		 * the page accessible (e.g. decrypted under SME) only for the
		 * duration of the copy - confirm against the arch hooks.
		 */
		arch_kexec_post_alloc_pages(page_address(page), 1, 0);
		ptr = kmap_local_page(page);
		ptr += maddr & ~PAGE_MASK;
		/* mchunk: destination bytes left in this page */
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		/* uchunk: how many of those are backed by buffer data */
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap_local(ptr);
		arch_kexec_pre_free_pages(page_address(page), 1);
		/* Non-zero here can only come from copy_from_user() */
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}
894 
895 int kimage_load_segment(struct kimage *image,
896 				struct kexec_segment *segment)
897 {
898 	int result = -ENOMEM;
899 
900 	switch (image->type) {
901 	case KEXEC_TYPE_DEFAULT:
902 		result = kimage_load_normal_segment(image, segment);
903 		break;
904 	case KEXEC_TYPE_CRASH:
905 		result = kimage_load_crash_segment(image, segment);
906 		break;
907 	}
908 
909 	return result;
910 }
911 
/*
 * Budget of remaining kexec loads for one image type.
 * @limit is -1 for "unlimited"; otherwise kexec_load_permitted() decrements
 * it on every permitted load and refuses loads once it reaches 0.
 */
struct kexec_load_limit {
	/* Mutex protects the limit count. */
	struct mutex mutex;
	int limit;
};
917 
/* Load budget for non-crash images; unlimited (-1) by default. */
static struct kexec_load_limit load_limit_reboot = {
	.mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
	.limit = -1,
};

/* Load budget for crash (KEXEC_TYPE_CRASH) images; unlimited (-1) by default. */
static struct kexec_load_limit load_limit_panic = {
	.mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
	.limit = -1,
};

/* NOTE(review): presumably the currently loaded normal image - confirm. */
struct kimage *kexec_image;
/* Crash image handed to machine_kexec() by __crash_kexec(); NULL if none. */
struct kimage *kexec_crash_image;
/* Non-zero makes kexec_load_permitted() refuse every load. */
static int kexec_load_disabled;
931 
932 #ifdef CONFIG_SYSCTL
933 static int kexec_limit_handler(struct ctl_table *table, int write,
934 			       void *buffer, size_t *lenp, loff_t *ppos)
935 {
936 	struct kexec_load_limit *limit = table->data;
937 	int val;
938 	struct ctl_table tmp = {
939 		.data = &val,
940 		.maxlen = sizeof(val),
941 		.mode = table->mode,
942 	};
943 	int ret;
944 
945 	if (write) {
946 		ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
947 		if (ret)
948 			return ret;
949 
950 		if (val < 0)
951 			return -EINVAL;
952 
953 		mutex_lock(&limit->mutex);
954 		if (limit->limit != -1 && val >= limit->limit)
955 			ret = -EINVAL;
956 		else
957 			limit->limit = val;
958 		mutex_unlock(&limit->mutex);
959 
960 		return ret;
961 	}
962 
963 	mutex_lock(&limit->mutex);
964 	val = limit->limit;
965 	mutex_unlock(&limit->mutex);
966 
967 	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
968 }
969 
/*
 * sysctl entries owned by the kexec core; kexec_core_sysctl_init()
 * registers them under "kernel".
 */
static struct ctl_table kexec_core_sysctls[] = {
	{
		.procname	= "kexec_load_disabled",
		.data		= &kexec_load_disabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* only handle a transition from default "0" to "1" */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_ONE,
	},
	{
		/* .data is a struct kexec_load_limit, decoded by the handler */
		.procname	= "kexec_load_limit_panic",
		.data		= &load_limit_panic,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{
		/* .data is a struct kexec_load_limit, decoded by the handler */
		.procname	= "kexec_load_limit_reboot",
		.data		= &load_limit_reboot,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{ }
};
995 
/*
 * Register kexec_core_sysctls under "kernel".  Runs as a late initcall and
 * always returns 0.
 */
static int __init kexec_core_sysctl_init(void)
{
	register_sysctl_init("kernel", kexec_core_sysctls);
	return 0;
}
late_initcall(kexec_core_sysctl_init);
1002 #endif
1003 
1004 bool kexec_load_permitted(int kexec_image_type)
1005 {
1006 	struct kexec_load_limit *limit;
1007 
1008 	/*
1009 	 * Only the superuser can use the kexec syscall and if it has not
1010 	 * been disabled.
1011 	 */
1012 	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1013 		return false;
1014 
1015 	/* Check limit counter and decrease it.*/
1016 	limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
1017 		&load_limit_panic : &load_limit_reboot;
1018 	mutex_lock(&limit->mutex);
1019 	if (!limit->limit) {
1020 		mutex_unlock(&limit->mutex);
1021 		return false;
1022 	}
1023 	if (limit->limit != -1)
1024 		limit->limit--;
1025 	mutex_unlock(&limit->mutex);
1026 
1027 	return true;
1028 }
1029 
1030 /*
1031  * No panic_cpu check version of crash_kexec().  This function is called
1032  * only when panic_cpu holds the current CPU number; this is the only CPU
1033  * which processes crash_kexec routines.
1034  */
1035 void __noclone __crash_kexec(struct pt_regs *regs)
1036 {
1037 	/* Take the kexec_lock here to prevent sys_kexec_load
1038 	 * running on one cpu from replacing the crash kernel
1039 	 * we are using after a panic on a different cpu.
1040 	 *
1041 	 * If the crash kernel was not located in a fixed area
1042 	 * of memory the xchg(&kexec_crash_image) would be
1043 	 * sufficient.  But since I reuse the memory...
1044 	 */
1045 	if (kexec_trylock()) {
1046 		if (kexec_crash_image) {
1047 			struct pt_regs fixed_regs;
1048 
1049 			crash_setup_regs(&fixed_regs, regs);
1050 			crash_save_vmcoreinfo();
1051 			machine_crash_shutdown(&fixed_regs);
1052 			machine_kexec(kexec_crash_image);
1053 		}
1054 		kexec_unlock();
1055 	}
1056 }
1057 STACK_FRAME_NON_STANDARD(__crash_kexec);
1058 
1059 __bpf_kfunc void crash_kexec(struct pt_regs *regs)
1060 {
1061 	int old_cpu, this_cpu;
1062 
1063 	/*
1064 	 * Only one CPU is allowed to execute the crash_kexec() code as with
1065 	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
1066 	 * may stop each other.  To exclude them, we use panic_cpu here too.
1067 	 */
1068 	old_cpu = PANIC_CPU_INVALID;
1069 	this_cpu = raw_smp_processor_id();
1070 
1071 	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
1072 		/* This is the 1st CPU which comes here, so go ahead. */
1073 		__crash_kexec(regs);
1074 
1075 		/*
1076 		 * Reset panic_cpu to allow another panic()/crash_kexec()
1077 		 * call.
1078 		 */
1079 		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
1080 	}
1081 }
1082 
1083 static inline resource_size_t crash_resource_size(const struct resource *res)
1084 {
1085 	return !res->end ? 0 : resource_size(res);
1086 }
1087 
1088 ssize_t crash_get_memory_size(void)
1089 {
1090 	ssize_t size = 0;
1091 
1092 	if (!kexec_trylock())
1093 		return -EBUSY;
1094 
1095 	size += crash_resource_size(&crashk_res);
1096 	size += crash_resource_size(&crashk_low_res);
1097 
1098 	kexec_unlock();
1099 	return size;
1100 }
1101 
1102 static int __crash_shrink_memory(struct resource *old_res,
1103 				 unsigned long new_size)
1104 {
1105 	struct resource *ram_res;
1106 
1107 	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1108 	if (!ram_res)
1109 		return -ENOMEM;
1110 
1111 	ram_res->start = old_res->start + new_size;
1112 	ram_res->end   = old_res->end;
1113 	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
1114 	ram_res->name  = "System RAM";
1115 
1116 	if (!new_size) {
1117 		release_resource(old_res);
1118 		old_res->start = 0;
1119 		old_res->end   = 0;
1120 	} else {
1121 		crashk_res.end = ram_res->start - 1;
1122 	}
1123 
1124 	crash_free_reserved_phys_range(ram_res->start, ram_res->end);
1125 	insert_resource(&iomem_resource, ram_res);
1126 
1127 	return 0;
1128 }
1129 
1130 int crash_shrink_memory(unsigned long new_size)
1131 {
1132 	int ret = 0;
1133 	unsigned long old_size, low_size;
1134 
1135 	if (!kexec_trylock())
1136 		return -EBUSY;
1137 
1138 	if (kexec_crash_image) {
1139 		ret = -ENOENT;
1140 		goto unlock;
1141 	}
1142 
1143 	low_size = crash_resource_size(&crashk_low_res);
1144 	old_size = crash_resource_size(&crashk_res) + low_size;
1145 	new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
1146 	if (new_size >= old_size) {
1147 		ret = (new_size == old_size) ? 0 : -EINVAL;
1148 		goto unlock;
1149 	}
1150 
1151 	/*
1152 	 * (low_size > new_size) implies that low_size is greater than zero.
1153 	 * This also means that if low_size is zero, the else branch is taken.
1154 	 *
1155 	 * If low_size is greater than 0, (low_size > new_size) indicates that
1156 	 * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
1157 	 * needs to be shrunken.
1158 	 */
1159 	if (low_size > new_size) {
1160 		ret = __crash_shrink_memory(&crashk_res, 0);
1161 		if (ret)
1162 			goto unlock;
1163 
1164 		ret = __crash_shrink_memory(&crashk_low_res, new_size);
1165 	} else {
1166 		ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
1167 	}
1168 
1169 	/* Swap crashk_res and crashk_low_res if needed */
1170 	if (!crashk_res.end && crashk_low_res.end) {
1171 		crashk_res.start = crashk_low_res.start;
1172 		crashk_res.end   = crashk_low_res.end;
1173 		release_resource(&crashk_low_res);
1174 		crashk_low_res.start = 0;
1175 		crashk_low_res.end   = 0;
1176 		insert_resource(&iomem_resource, &crashk_res);
1177 	}
1178 
1179 unlock:
1180 	kexec_unlock();
1181 	return ret;
1182 }
1183 
1184 void crash_save_cpu(struct pt_regs *regs, int cpu)
1185 {
1186 	struct elf_prstatus prstatus;
1187 	u32 *buf;
1188 
1189 	if ((cpu < 0) || (cpu >= nr_cpu_ids))
1190 		return;
1191 
1192 	/* Using ELF notes here is opportunistic.
1193 	 * I need a well defined structure format
1194 	 * for the data I pass, and I need tags
1195 	 * on the data to indicate what information I have
1196 	 * squirrelled away.  ELF notes happen to provide
1197 	 * all of that, so there is no need to invent something new.
1198 	 */
1199 	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1200 	if (!buf)
1201 		return;
1202 	memset(&prstatus, 0, sizeof(prstatus));
1203 	prstatus.common.pr_pid = current->pid;
1204 	elf_core_copy_regs(&prstatus.pr_reg, regs);
1205 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1206 			      &prstatus, sizeof(prstatus));
1207 	final_note(buf);
1208 }
1209 
1210 /*
1211  * Move into place and start executing a preloaded standalone
1212  * executable.  If nothing was preloaded return an error.
1213  */
1214 int kernel_kexec(void)
1215 {
1216 	int error = 0;
1217 
1218 	if (!kexec_trylock())
1219 		return -EBUSY;
1220 	if (!kexec_image) {
1221 		error = -EINVAL;
1222 		goto Unlock;
1223 	}
1224 
1225 #ifdef CONFIG_KEXEC_JUMP
1226 	if (kexec_image->preserve_context) {
1227 		pm_prepare_console();
1228 		error = freeze_processes();
1229 		if (error) {
1230 			error = -EBUSY;
1231 			goto Restore_console;
1232 		}
1233 		suspend_console();
1234 		error = dpm_suspend_start(PMSG_FREEZE);
1235 		if (error)
1236 			goto Resume_console;
1237 		/* At this point, dpm_suspend_start() has been called,
1238 		 * but *not* dpm_suspend_end(). We *must* call
1239 		 * dpm_suspend_end() now.  Otherwise, drivers for
1240 		 * some devices (e.g. interrupt controllers) become
1241 		 * desynchronized with the actual state of the
1242 		 * hardware at resume time, and evil weirdness ensues.
1243 		 */
1244 		error = dpm_suspend_end(PMSG_FREEZE);
1245 		if (error)
1246 			goto Resume_devices;
1247 		error = suspend_disable_secondary_cpus();
1248 		if (error)
1249 			goto Enable_cpus;
1250 		local_irq_disable();
1251 		error = syscore_suspend();
1252 		if (error)
1253 			goto Enable_irqs;
1254 	} else
1255 #endif
1256 	{
1257 		kexec_in_progress = true;
1258 		kernel_restart_prepare("kexec reboot");
1259 		migrate_to_reboot_cpu();
1260 		syscore_shutdown();
1261 
1262 		/*
1263 		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1264 		 * no further code needs to use CPU hotplug (which is true in
1265 		 * the reboot case). However, the kexec path depends on using
1266 		 * CPU hotplug again; so re-enable it here.
1267 		 */
1268 		cpu_hotplug_enable();
1269 		pr_notice("Starting new kernel\n");
1270 		machine_shutdown();
1271 	}
1272 
1273 	kmsg_dump(KMSG_DUMP_SHUTDOWN);
1274 	machine_kexec(kexec_image);
1275 
1276 #ifdef CONFIG_KEXEC_JUMP
1277 	if (kexec_image->preserve_context) {
1278 		syscore_resume();
1279  Enable_irqs:
1280 		local_irq_enable();
1281  Enable_cpus:
1282 		suspend_enable_secondary_cpus();
1283 		dpm_resume_start(PMSG_RESTORE);
1284  Resume_devices:
1285 		dpm_resume_end(PMSG_RESTORE);
1286  Resume_console:
1287 		resume_console();
1288 		thaw_processes();
1289  Restore_console:
1290 		pm_restore_console();
1291 	}
1292 #endif
1293 
1294  Unlock:
1295 	kexec_unlock();
1296 	return error;
1297 }
1298