xref: /linux/kernel/kexec.c (revision d67b569f5f620c0fb95d5212642746b7ba9d29e4)
1 /*
2  * kexec.c - kexec system call
3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
4  *
5  * This source code is licensed under the GNU General Public License,
6  * Version 2.  See the file COPYING for more details.
7  */
8 
9 #include <linux/mm.h>
10 #include <linux/file.h>
11 #include <linux/slab.h>
12 #include <linux/fs.h>
13 #include <linux/kexec.h>
14 #include <linux/spinlock.h>
15 #include <linux/list.h>
16 #include <linux/highmem.h>
17 #include <linux/syscalls.h>
18 #include <linux/reboot.h>
20 #include <linux/ioport.h>
21 #include <linux/hardirq.h>
22 
23 #include <asm/page.h>
24 #include <asm/uaccess.h>
25 #include <asm/io.h>
26 #include <asm/system.h>
27 #include <asm/semaphore.h>
28 
29 /* Location of the reserved area for the crash kernel */
30 struct resource crashk_res = {
31 	.name  = "Crash kernel",
32 	.start = 0,
33 	.end   = 0,
34 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
35 };
36 
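/*
 * Decide whether an oops in task @p warrants switching into the
 * crash kernel: do so for oopses in interrupt context, in the idle
 * task (pid 0), in init (pid 1), or whenever panic_on_oops is set.
 */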
37 int kexec_should_crash(struct task_struct *p)
38 {
39 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 		return 1;
41 	return 0;
42 }
43 
44 /*
45  * When kexec transitions to the new kernel there is a one-to-one
46  * mapping between physical and virtual addresses.  On processors
47  * where you can disable the MMU this is trivial, and easy.  For
48  * others it is still a simple predictable page table to setup.
49  *
50  * In that environment kexec copies the new kernel to its final
51  * resting place.  This means I can only support memory whose
52  * physical address can fit in an unsigned long.  In particular
53  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54  * If the assembly stub has more restrictive requirements
55  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56  * defined more restrictively in <asm/kexec.h>.
57  *
58  * The code for the transition from the current kernel to
59  * the new kernel is placed in the control_code_buffer, whose size
60  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
61  * page of memory is necessary, but some architectures require more.
62  * Because this memory must be identity mapped in the transition from
63  * virtual to physical addresses it must live in the range
64  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65  * modifiable.
66  *
67  * The assembly stub in the control code buffer is passed a linked list
68  * of descriptor pages detailing the source pages of the new kernel,
69  * and the destination addresses of those source pages.  As this data
70  * structure is not used in the context of the current OS, it must
71  * be self-contained.
72  *
73  * The code has been made to work with highmem pages and will use a
74  * destination page in its final resting place (if it happens
75  * to allocate it).  The end product of this is that most of the
76  * physical address space, and most of RAM can be used.
77  *
78  * Future directions include:
79  *  - allocating a page table with the control code buffer identity
80  *    mapped, to simplify machine_kexec and make kexec_on_panic more
81  *    reliable.
82  */
83 
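/*
 * In outline, the list handed to the assembly stub is built from
 * kimage_entry_t words: a page's physical address with one of the
 * low IND_* bits set.  IND_DESTINATION announces where the pages
 * that follow should land, IND_SOURCE names a page holding data to
 * copy there, IND_INDIRECTION chains to the next page of entries,
 * and IND_DONE terminates the list (see kimage_add_entry() below).
 */
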
84 /*
85  * KIMAGE_NO_DEST is an impossible destination address, used when
86  * allocating pages whose destination address we do not care about.
87  */
88 #define KIMAGE_NO_DEST (-1UL)
89 
90 static int kimage_is_destination_range(struct kimage *image,
91 				       unsigned long start, unsigned long end);
92 static struct page *kimage_alloc_page(struct kimage *image,
93 				       unsigned int gfp_mask,
94 				       unsigned long dest);
95 
96 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 	                    unsigned long nr_segments,
98                             struct kexec_segment __user *segments)
99 {
100 	size_t segment_bytes;
101 	struct kimage *image;
102 	unsigned long i;
103 	int result;
104 
105 	/* Allocate a controlling structure */
106 	result = -ENOMEM;
107 	image = kmalloc(sizeof(*image), GFP_KERNEL);
108 	if (!image)
109 		goto out;
110 
111 	memset(image, 0, sizeof(*image));
112 	image->head = 0;
113 	image->entry = &image->head;
114 	image->last_entry = &image->head;
115 	image->control_page = ~0; /* By default this does not apply */
116 	image->start = entry;
117 	image->type = KEXEC_TYPE_DEFAULT;
118 
119 	/* Initialize the list of control pages */
120 	INIT_LIST_HEAD(&image->control_pages);
121 
122 	/* Initialize the list of destination pages */
123 	INIT_LIST_HEAD(&image->dest_pages);
124 
125 	/* Initialize the list of unuseable pages */
126 	INIT_LIST_HEAD(&image->unuseable_pages);
127 
128 	/* Read in the segments */
129 	image->nr_segments = nr_segments;
130 	segment_bytes = nr_segments * sizeof(*segments);
131 	if (copy_from_user(image->segment, segments, segment_bytes)) {
132 		/* copy_from_user() returns bytes not copied, not an errno */
133 		result = -EFAULT;
		goto out;
	}
134 
135 	/*
136 	 * Verify we have good destination addresses.  The caller is
137 	 * responsible for making certain we don't attempt to load
138 	 * the new image into invalid or reserved areas of RAM.  This
139 	 * just verifies it is an address we can use.
140 	 *
141 	 * Since the kernel does everything in page size chunks, ensure
142 	 * the destination addresses are page aligned.  Too many
143 	 * special cases crop up when we don't do this.  The most
144 	 * insidious is getting overlapping destination addresses
145 	 * simply because addresses are changed to page size
146 	 * granularity.
147 	 */
148 	result = -EADDRNOTAVAIL;
149 	for (i = 0; i < nr_segments; i++) {
150 		unsigned long mstart, mend;
151 
152 		mstart = image->segment[i].mem;
153 		mend   = mstart + image->segment[i].memsz;
154 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 			goto out;
156 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 			goto out;
158 	}
159 
160 	/* Verify our destination addresses do not overlap.
161 	 * If we allowed overlapping destination addresses
162 	 * through, very weird things could happen with no
163 	 * easy explanation as one segment stomps on another.
164 	 */
165 	result = -EINVAL;
166 	for (i = 0; i < nr_segments; i++) {
167 		unsigned long mstart, mend;
168 		unsigned long j;
169 
170 		mstart = image->segment[i].mem;
171 		mend   = mstart + image->segment[i].memsz;
172 		for (j = 0; j < i; j++) {
173 			unsigned long pstart, pend;
174 			pstart = image->segment[j].mem;
175 			pend   = pstart + image->segment[j].memsz;
176 			/* Do the segments overlap ? */
177 			if ((mend > pstart) && (mstart < pend))
178 				goto out;
179 		}
180 	}
181 
182 	/* Ensure our buffer sizes are strictly less than
183 	 * our memory sizes.  This should always be the case,
184 	 * and it is easier to check up front than to be surprised
185 	 * later on.
186 	 */
187 	result = -EINVAL;
188 	for (i = 0; i < nr_segments; i++) {
189 		if (image->segment[i].bufsz > image->segment[i].memsz)
190 			goto out;
191 	}
192 
193 	result = 0;
194 out:
195 	if (result == 0)
196 		*rimage = image;
197 	else
198 		kfree(image);
199 
200 	return result;
201 
202 }
203 
204 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 				unsigned long nr_segments,
206 				struct kexec_segment __user *segments)
207 {
208 	int result;
209 	struct kimage *image;
210 
211 	/* Allocate and initialize a controlling structure */
212 	image = NULL;
213 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 	if (result)
215 		goto out;
216 
219 	/*
220 	 * Find a location for the control code buffer, and add it to
221 	 * the vector of segments so that its pages will also be
222 	 * counted as destination pages.
223 	 */
224 	result = -ENOMEM;
225 	image->control_code_page = kimage_alloc_control_pages(image,
226 					   get_order(KEXEC_CONTROL_CODE_SIZE));
227 	if (!image->control_code_page) {
228 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 		goto out;
230 	}
231 
232 	result = 0;
233  out:
234 	if (result == 0)
235 		*rimage = image;
236 	else
237 		kfree(image);
238 
239 	return result;
240 }
241 
242 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
243 				unsigned long nr_segments,
244 				struct kexec_segment __user *segments)
245 {
246 	int result;
247 	struct kimage *image;
248 	unsigned long i;
249 
250 	image = NULL;
251 	/* Verify we have a valid entry point */
252 	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
253 		result = -EADDRNOTAVAIL;
254 		goto out;
255 	}
256 
257 	/* Allocate and initialize a controlling structure */
258 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
259 	if (result)
260 		goto out;
261 
262 	/* Enable the special crash kernel control page
263 	 * allocation policy.
264 	 */
265 	image->control_page = crashk_res.start;
266 	image->type = KEXEC_TYPE_CRASH;
267 
268 	/*
269 	 * Verify we have good destination addresses.  Normally
270 	 * the caller is responsible for making certain we don't
271 	 * attempt to load the new image into invalid or reserved
272 	 * areas of RAM.  But crash kernels are preloaded into a
273 	 * reserved area of ram.  We must ensure the addresses
274 	 * are in the reserved area otherwise preloading the
275 	 * kernel could corrupt things.
276 	 */
277 	result = -EADDRNOTAVAIL;
278 	for (i = 0; i < nr_segments; i++) {
279 		unsigned long mstart, mend;
280 
281 		mstart = image->segment[i].mem;
282 		mend = mstart + image->segment[i].memsz - 1;
283 		/* Ensure we are within the crash kernel limits */
284 		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
285 			goto out;
286 	}
287 
288 	/*
289 	 * Find a location for the control code buffer, and add it to
290 	 * the vector of segments so that its pages will also be
291 	 * counted as destination pages.
292 	 */
293 	result = -ENOMEM;
294 	image->control_code_page = kimage_alloc_control_pages(image,
295 					   get_order(KEXEC_CONTROL_CODE_SIZE));
296 	if (!image->control_code_page) {
297 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
298 		goto out;
299 	}
300 
301 	result = 0;
302 out:
303 	if (result == 0)
304 		*rimage = image;
305 	else
306 		kfree(image);
307 
308 	return result;
309 }
310 
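/*
 * Does the range [start, end) overlap any of the image's
 * destination segments?
 */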
311 static int kimage_is_destination_range(struct kimage *image,
312 					unsigned long start,
313 					unsigned long end)
314 {
315 	unsigned long i;
316 
317 	for (i = 0; i < image->nr_segments; i++) {
318 		unsigned long mstart, mend;
319 
320 		mstart = image->segment[i].mem;
321 		mend = mstart + image->segment[i].memsz;
322 		if ((end > mstart) && (start < mend))
323 			return 1;
324 	}
325 
326 	return 0;
327 }
328 
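/*
 * Allocate an order sized block of pages, mark each one reserved so
 * the rest of the kernel leaves them alone, and stash the order in
 * page->private so kimage_free_pages() knows how much to free later.
 */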
329 static struct page *kimage_alloc_pages(unsigned int gfp_mask,
330 					unsigned int order)
331 {
332 	struct page *pages;
333 
334 	pages = alloc_pages(gfp_mask, order);
335 	if (pages) {
336 		unsigned int count, i;
337 		pages->mapping = NULL;
338 		pages->private = order;
339 		count = 1 << order;
340 		for (i = 0; i < count; i++)
341 			SetPageReserved(pages + i);
342 	}
343 
344 	return pages;
345 }
346 
347 static void kimage_free_pages(struct page *page)
348 {
349 	unsigned int order, count, i;
350 
351 	order = page->private;
352 	count = 1 << order;
353 	for (i = 0; i < count; i++)
354 		ClearPageReserved(page + i);
355 	__free_pages(page, order);
356 }
357 
358 static void kimage_free_page_list(struct list_head *list)
359 {
360 	struct list_head *pos, *next;
361 
362 	list_for_each_safe(pos, next, list) {
363 		struct page *page;
364 
365 		page = list_entry(pos, struct page, lru);
366 		list_del(&page->lru);
367 		kimage_free_pages(page);
368 	}
369 }
370 
371 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
372 							unsigned int order)
373 {
374 	/* Control pages are special, they are the intermediaries
375 	 * that are needed while we copy the rest of the pages
376 	 * to their final resting place.  As such they must
377 	 * not conflict with either the destination addresses
378 	 * or memory the kernel is already using.
379 	 *
380 	 * The only case where we really need more than one of
381 	 * these are for architectures where we cannot disable
382 	 * the MMU and must instead generate an identity mapped
383 	 * page table for all of the memory.
384 	 *
385 	 * At worst this runs in O(N) of the image size.
386 	 */
387 	struct list_head extra_pages;
388 	struct page *pages;
389 	unsigned int count;
390 
391 	count = 1 << order;
392 	INIT_LIST_HEAD(&extra_pages);
393 
394 	/* Loop while I can allocate a page and the page allocated
395 	 * is a destination page.
396 	 */
397 	do {
398 		unsigned long pfn, epfn, addr, eaddr;
399 
400 		pages = kimage_alloc_pages(GFP_KERNEL, order);
401 		if (!pages)
402 			break;
403 		pfn   = page_to_pfn(pages);
404 		epfn  = pfn + count;
405 		addr  = pfn << PAGE_SHIFT;
406 		eaddr = epfn << PAGE_SHIFT;
407 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
408 			      kimage_is_destination_range(image, addr, eaddr)) {
409 			list_add(&pages->lru, &extra_pages);
410 			pages = NULL;
411 		}
412 	} while (!pages);
413 
414 	if (pages) {
415 		/* Remember the allocated page... */
416 		list_add(&pages->lru, &image->control_pages);
417 
418 		/* Because the page is already in its destination
419 		 * location we will never allocate another page at
420 		 * that address.  Therefore kimage_alloc_pages
421 		 * will not return it (again) and we don't need
422 		 * to give it an entry in image->segment[].
423 		 */
424 	}
425 	/* Deal with the destination pages I have inadvertently allocated.
426 	 *
427 	 * Ideally I would convert multi-page allocations into single
428 	 * page allocations, and add everything to image->dest_pages.
429 	 *
430 	 * For now it is simpler to just free the pages.
431 	 */
432 	kimage_free_page_list(&extra_pages);
433 
434 	return pages;
435 }
436 
437 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
438 						      unsigned int order)
439 {
440 	/* Control pages are special, they are the intermediaries
441 	 * that are needed while we copy the rest of the pages
442 	 * to their final resting place.  As such they must
443 	 * not conflict with either the destination addresses
444 	 * or memory the kernel is already using.
445 	 *
446 	 * Control pages are also the only pages we must allocate
447 	 * when loading a crash kernel.  All of the other pages
448 	 * are specified by the segments and we just memcpy
449 	 * into them directly.
450 	 *
451 	 * The only case where we really need more than one of
452 	 * these are for architectures where we cannot disable
453 	 * the MMU and must instead generate an identity mapped
454 	 * page table for all of the memory.
455 	 *
456 	 * Given the low demand this implements a very simple
457 	 * allocator that finds the first hole of the appropriate
458 	 * size in the reserved memory region, and allocates all
459 	 * of the memory up to and including the hole.
460 	 */
461 	unsigned long hole_start, hole_end, size;
462 	struct page *pages;
463 
464 	pages = NULL;
465 	size = (1 << order) << PAGE_SHIFT;
466 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
467 	hole_end   = hole_start + size - 1;
468 	while (hole_end <= crashk_res.end) {
469 		unsigned long i;
470 
471 		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
472 			break;
473 		if (hole_end > crashk_res.end)
474 			break;
475 		/* See if I overlap any of the segments */
476 		for (i = 0; i < image->nr_segments; i++) {
477 			unsigned long mstart, mend;
478 
479 			mstart = image->segment[i].mem;
480 			mend   = mstart + image->segment[i].memsz - 1;
481 			if ((hole_end >= mstart) && (hole_start <= mend)) {
482 				/* Advance the hole to the end of the segment */
483 				hole_start = (mend + (size - 1)) & ~(size - 1);
484 				hole_end   = hole_start + size - 1;
485 				break;
486 			}
487 		}
488 		/* If I don't overlap any segments I have found my hole! */
489 		if (i == image->nr_segments) {
490 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
491 			break;
492 		}
493 	}
494 	if (pages)
495 		image->control_page = hole_end;
496 
497 	return pages;
498 }
499 
501 struct page *kimage_alloc_control_pages(struct kimage *image,
502 					 unsigned int order)
503 {
504 	struct page *pages = NULL;
505 
506 	switch (image->type) {
507 	case KEXEC_TYPE_DEFAULT:
508 		pages = kimage_alloc_normal_control_pages(image, order);
509 		break;
510 	case KEXEC_TYPE_CRASH:
511 		pages = kimage_alloc_crash_control_pages(image, order);
512 		break;
513 	}
514 
515 	return pages;
516 }
517 
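/*
 * Append one entry to the kimage entry list.  When the current
 * indirection page fills up, a fresh page is allocated, linked in
 * with an IND_INDIRECTION entry, and writing continues there.  The
 * list is kept terminated with a zero entry after every append.
 */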
518 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
519 {
520 	if (*image->entry != 0)
521 		image->entry++;
522 
523 	if (image->entry == image->last_entry) {
524 		kimage_entry_t *ind_page;
525 		struct page *page;
526 
527 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
528 		if (!page)
529 			return -ENOMEM;
530 
531 		ind_page = page_address(page);
532 		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
533 		image->entry = ind_page;
534 		image->last_entry = ind_page +
535 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
536 	}
537 	*image->entry = entry;
538 	image->entry++;
539 	*image->entry = 0;
540 
541 	return 0;
542 }
543 
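/*
 * Record where the following source pages should be copied to.  Each
 * IND_SOURCE page added after this advances image->destination by
 * PAGE_SIZE, so consecutive pages land back to back.
 */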
544 static int kimage_set_destination(struct kimage *image,
545 				   unsigned long destination)
546 {
547 	int result;
548 
549 	destination &= PAGE_MASK;
550 	result = kimage_add_entry(image, destination | IND_DESTINATION);
551 	if (result == 0)
552 		image->destination = destination;
553 
554 	return result;
555 }
556 
558 static int kimage_add_page(struct kimage *image, unsigned long page)
559 {
560 	int result;
561 
562 	page &= PAGE_MASK;
563 	result = kimage_add_entry(image, page | IND_SOURCE);
564 	if (result == 0)
565 		image->destination += PAGE_SIZE;
566 
567 	return result;
568 }
569 
571 static void kimage_free_extra_pages(struct kimage *image)
572 {
573 	/* Walk through and free any extra destination pages I may have */
574 	kimage_free_page_list(&image->dest_pages);
575 
576 	/* Walk through and free any unuseable pages I have cached */
577 	kimage_free_page_list(&image->unuseable_pages);
579 }

580 static int kimage_terminate(struct kimage *image)
581 {
582 	if (*image->entry != 0)
583 		image->entry++;
584 
585 	*image->entry = IND_DONE;
586 
587 	return 0;
588 }
589 
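/*
 * Walk every entry in the image list, transparently following
 * IND_INDIRECTION links to the next page of entries and stopping at
 * IND_DONE (or at a zero entry on an unterminated list).
 */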
590 #define for_each_kimage_entry(image, ptr, entry) \
591 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
592 		ptr = (entry & IND_INDIRECTION)? \
593 			phys_to_virt((entry & PAGE_MASK)): ptr +1)
594 
595 static void kimage_free_entry(kimage_entry_t entry)
596 {
597 	struct page *page;
598 
599 	page = pfn_to_page(entry >> PAGE_SHIFT);
600 	kimage_free_pages(page);
601 }
602 
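/*
 * Tear down a partially or fully loaded image: free every source
 * page, free each indirection page once the walk has moved past it,
 * run the machine specific cleanup, and release the control pages
 * and the kimage structure itself.
 */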
603 static void kimage_free(struct kimage *image)
604 {
605 	kimage_entry_t *ptr, entry;
606 	kimage_entry_t ind = 0;
607 
608 	if (!image)
609 		return;
610 
611 	kimage_free_extra_pages(image);
612 	for_each_kimage_entry(image, ptr, entry) {
613 		if (entry & IND_INDIRECTION) {
614 			/* Free the previous indirection page */
615 			if (ind & IND_INDIRECTION)
616 				kimage_free_entry(ind);
617 			/* Save this indirection page until we are
618 			 * done with it.
619 			 */
620 			ind = entry;
621 		}
622 		else if (entry & IND_SOURCE)
623 			kimage_free_entry(entry);
624 	}
625 	/* Free the final indirection page */
626 	if (ind & IND_INDIRECTION)
627 		kimage_free_entry(ind);
628 
629 	/* Handle any machine specific cleanup */
630 	machine_kexec_cleanup(image);
631 
632 	/* Free the kexec control pages... */
633 	kimage_free_page_list(&image->control_pages);
634 	kfree(image);
635 }
636 
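/*
 * Scan the entry list to see whether @page is already claimed as the
 * destination of some source page; if so, return a pointer to that
 * IND_SOURCE entry so the caller can swap pages around.
 */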
637 static kimage_entry_t *kimage_dst_used(struct kimage *image,
638 					unsigned long page)
639 {
640 	kimage_entry_t *ptr, entry;
641 	unsigned long destination = 0;
642 
643 	for_each_kimage_entry(image, ptr, entry) {
644 		if (entry & IND_DESTINATION)
645 			destination = entry & PAGE_MASK;
646 		else if (entry & IND_SOURCE) {
647 			if (page == destination)
648 				return ptr;
649 			destination += PAGE_SIZE;
650 		}
651 	}
652 
653 	return NULL;
654 }
655 
656 static struct page *kimage_alloc_page(struct kimage *image,
657 					unsigned int gfp_mask,
658 					unsigned long destination)
659 {
660 	/*
661 	 * Here we implement safeguards to ensure that a source page
662 	 * is not copied to its destination page before the data on
663 	 * the destination page is no longer useful.
664 	 *
665 	 * To do this we maintain the invariant that a source page is
666 	 * either its own destination page, or it is not a
667 	 * destination page at all.
668 	 *
669 	 * That is slightly stronger than required, but the proof
670 	 * that no problems will occur is trivial, and the
671 	 * implementation is simple to verify.
672 	 *
673 	 * When allocating all pages normally this algorithm will run
674 	 * in O(N) time, but in the worst case it will run in O(N^2)
675 	 * time.   If the runtime is a problem the data structures can
676 	 * be fixed.
677 	 */
678 	struct page *page;
679 	unsigned long addr;
680 
681 	/*
682 	 * Walk through the list of destination pages, and see if I
683 	 * have a match.
684 	 */
685 	list_for_each_entry(page, &image->dest_pages, lru) {
686 		addr = page_to_pfn(page) << PAGE_SHIFT;
687 		if (addr == destination) {
688 			list_del(&page->lru);
689 			return page;
690 		}
691 	}
692 	page = NULL;
693 	while (1) {
694 		kimage_entry_t *old;
695 
696 		/* Allocate a page, if we run out of memory give up */
697 		page = kimage_alloc_pages(gfp_mask, 0);
698 		if (!page)
699 			return NULL;
700 		/* If the page cannot be used, file it away */
701 		if (page_to_pfn(page) >
702 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
703 			list_add(&page->lru, &image->unuseable_pages);
704 			continue;
705 		}
706 		addr = page_to_pfn(page) << PAGE_SHIFT;
707 
708 		/* If it is the destination page we want, use it */
709 		if (addr == destination)
710 			break;
711 
712 		/* If the page is not a destination page use it */
713 		if (!kimage_is_destination_range(image, addr,
714 						  addr + PAGE_SIZE))
715 			break;
716 
717 		/*
718 		 * I know that the page is someone's destination page.
719 		 * See if there is already a source page for this
720 		 * destination page, and if so swap the source pages.
721 		 */
722 		old = kimage_dst_used(image, addr);
723 		if (old) {
724 			/* If so move it */
725 			unsigned long old_addr;
726 			struct page *old_page;
727 
728 			old_addr = *old & PAGE_MASK;
729 			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
730 			copy_highpage(page, old_page);
731 			*old = addr | (*old & ~PAGE_MASK);
732 
733 			/* The old page I have found cannot be a
734 			 * destination page, so return it.
735 			 */
736 			addr = old_addr;
737 			page = old_page;
738 			break;
739 		}
740 		else {
741 			/* Place the page on the destination list; I
742 			 * will use it later.
743 			 */
744 			list_add(&page->lru, &image->dest_pages);
745 		}
746 	}
747 
748 	return page;
749 }
750 
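/*
 * Copy one segment from user space into freshly allocated pages and
 * add them to the entry list.  Only bufsz bytes are copied from the
 * user buffer; the remainder of memsz is left as zero padding, since
 * each page is cleared before the copy.
 */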
751 static int kimage_load_normal_segment(struct kimage *image,
752 					 struct kexec_segment *segment)
753 {
754 	unsigned long maddr;
755 	unsigned long ubytes, mbytes;
756 	int result;
757 	unsigned char __user *buf;
758 
759 	result = 0;
760 	buf = segment->buf;
761 	ubytes = segment->bufsz;
762 	mbytes = segment->memsz;
763 	maddr = segment->mem;
764 
765 	result = kimage_set_destination(image, maddr);
766 	if (result < 0)
767 		goto out;
768 
769 	while (mbytes) {
770 		struct page *page;
771 		char *ptr;
772 		size_t uchunk, mchunk;
773 
774 		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
775 		if (!page) {
776 			result  = -ENOMEM;
777 			goto out;
778 		}
779 		result = kimage_add_page(image, page_to_pfn(page)
780 								<< PAGE_SHIFT);
781 		if (result < 0)
782 			goto out;
783 
784 		ptr = kmap(page);
785 		/* Start with a clear page */
786 		memset(ptr, 0, PAGE_SIZE);
787 		ptr += maddr & ~PAGE_MASK;
788 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
789 		if (mchunk > mbytes)
790 			mchunk = mbytes;
791 
792 		uchunk = mchunk;
793 		if (uchunk > ubytes)
794 			uchunk = ubytes;
795 
796 		result = copy_from_user(ptr, buf, uchunk);
797 		kunmap(page);
798 		if (result) {
799 			result = (result < 0) ? result : -EIO;
800 			goto out;
801 		}
802 		ubytes -= uchunk;
803 		maddr  += mchunk;
804 		buf    += mchunk;
805 		mbytes -= mchunk;
806 	}
807 out:
808 	return result;
809 }
810 
811 static int kimage_load_crash_segment(struct kimage *image,
812 					struct kexec_segment *segment)
813 {
814 	/* For crash dump kernels we simply copy the data from
815 	 * user space to its destination.
816 	 * We do things a page at a time for the sake of kmap.
817 	 */
818 	unsigned long maddr;
819 	unsigned long ubytes, mbytes;
820 	int result;
821 	unsigned char __user *buf;
822 
823 	result = 0;
824 	buf = segment->buf;
825 	ubytes = segment->bufsz;
826 	mbytes = segment->memsz;
827 	maddr = segment->mem;
828 	while (mbytes) {
829 		struct page *page;
830 		char *ptr;
831 		size_t uchunk, mchunk;
832 
833 		page = pfn_to_page(maddr >> PAGE_SHIFT);
834 		if (!page) {
835 			result  = -ENOMEM;
836 			goto out;
837 		}
838 		ptr = kmap(page);
839 		ptr += maddr & ~PAGE_MASK;
840 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
841 		if (mchunk > mbytes)
842 			mchunk = mbytes;
843 
844 		uchunk = mchunk;
845 		if (uchunk > ubytes) {
846 			uchunk = ubytes;
847 			/* Zero the trailing part of the page */
848 			memset(ptr + uchunk, 0, mchunk - uchunk);
849 		}
850 		result = copy_from_user(ptr, buf, uchunk);
851 		kunmap(page);
852 		if (result) {
853 			result = (result < 0) ? result : -EIO;
854 			goto out;
855 		}
856 		ubytes -= uchunk;
857 		maddr  += mchunk;
858 		buf    += mchunk;
859 		mbytes -= mchunk;
860 	}
861 out:
862 	return result;
863 }
864 
865 static int kimage_load_segment(struct kimage *image,
866 				struct kexec_segment *segment)
867 {
868 	int result = -ENOMEM;
869 
870 	switch (image->type) {
871 	case KEXEC_TYPE_DEFAULT:
872 		result = kimage_load_normal_segment(image, segment);
873 		break;
874 	case KEXEC_TYPE_CRASH:
875 		result = kimage_load_crash_segment(image, segment);
876 		break;
877 	}
878 
879 	return result;
880 }
881 
882 /*
883  * Exec Kernel system call: for obvious reasons only root may call it.
884  *
885  * This call breaks up into three pieces.
886  * - A generic part which loads the new kernel from the current
887  *   address space, and very carefully places the data in the
888  *   allocated pages.
889  *
890  * - A generic part that interacts with the kernel and tells all of
891  *   the devices to shut down, preventing ongoing DMAs and placing
892  *   the devices in a consistent state so a later kernel can
893  *   reinitialize them.
894  *
895  * - A machine specific part that includes the syscall number
896  *   and then copies the image to its final destination and
897  *   jumps into the image at entry.
898  *
899  * kexec does not sync or unmount filesystems, so if you need
900  * that to happen you need to do that yourself.
901  */
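
/*
 * Illustrative only: user space normally reaches sys_kexec_load()
 * below through the raw syscall, roughly
 *
 *	struct kexec_segment seg = {
 *		.buf = buf, .bufsz = buflen,
 *		.mem = dest, .memsz = memlen,
 *	};
 *	syscall(__NR_kexec_load, entry, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * where buf, buflen, dest, memlen and entry are placeholders chosen
 * by the loader (e.g. kexec-tools); dest and memlen must be page
 * aligned and the caller needs CAP_SYS_BOOT.
 */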
902 struct kimage *kexec_image = NULL;
903 static struct kimage *kexec_crash_image = NULL;
904 /*
905  * A home grown binary mutex.
906  * Nothing can wait so this mutex is safe to use
907  * in interrupt context :)
908  */
909 static int kexec_lock = 0;
910 
911 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
912 				struct kexec_segment __user *segments,
913 				unsigned long flags)
914 {
915 	struct kimage **dest_image, *image;
916 	int locked;
917 	int result;
918 
919 	/* We only trust the superuser with rebooting the system. */
920 	if (!capable(CAP_SYS_BOOT))
921 		return -EPERM;
922 
923 	/*
924 	 * Verify we have a legal set of flags
925 	 * This leaves us room for future extensions.
926 	 */
927 	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
928 		return -EINVAL;
929 
930 	/* Verify we are on the appropriate architecture */
931 	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
932 		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
933 		return -EINVAL;
934 
935 	/* Put an artificial cap on the number
936 	 * of segments passed to kexec_load.
937 	 */
938 	if (nr_segments > KEXEC_SEGMENT_MAX)
939 		return -EINVAL;
940 
941 	image = NULL;
942 	result = 0;
943 
944 	/* Because we write directly to the reserved memory
945 	 * region when loading crash kernels we need a mutex here to
946 	 * prevent multiple crash kernels from attempting to load
947 	 * simultaneously, and to prevent a crash kernel from loading
948 	 * over the top of an in-use crash kernel.
949 	 *
950 	 * KISS: always take the mutex.
951 	 */
952 	locked = xchg(&kexec_lock, 1);
953 	if (locked)
954 		return -EBUSY;
955 
956 	dest_image = &kexec_image;
957 	if (flags & KEXEC_ON_CRASH)
958 		dest_image = &kexec_crash_image;
959 	if (nr_segments > 0) {
960 		unsigned long i;
961 
962 		/* Loading another kernel to reboot into */
963 		if ((flags & KEXEC_ON_CRASH) == 0)
964 			result = kimage_normal_alloc(&image, entry,
965 							nr_segments, segments);
966 		/* Loading another kernel to switch to if this one crashes */
967 		else if (flags & KEXEC_ON_CRASH) {
968 			/* Free any current crash dump kernel before
969 			 * we corrupt it.
970 			 */
971 			kimage_free(xchg(&kexec_crash_image, NULL));
972 			result = kimage_crash_alloc(&image, entry,
973 						     nr_segments, segments);
974 		}
975 		if (result)
976 			goto out;
977 
978 		result = machine_kexec_prepare(image);
979 		if (result)
980 			goto out;
981 
982 		for (i = 0; i < nr_segments; i++) {
983 			result = kimage_load_segment(image, &image->segment[i]);
984 			if (result)
985 				goto out;
986 		}
987 		result = kimage_terminate(image);
988 		if (result)
989 			goto out;
990 	}
991 	/* Install the new kernel and uninstall the old */
992 	image = xchg(dest_image, image);
993 
994 out:
995 	xchg(&kexec_lock, 0); /* Release the mutex */
996 	kimage_free(image);
997 
998 	return result;
999 }
1000 
1001 #ifdef CONFIG_COMPAT
1002 asmlinkage long compat_sys_kexec_load(unsigned long entry,
1003 				unsigned long nr_segments,
1004 				struct compat_kexec_segment __user *segments,
1005 				unsigned long flags)
1006 {
1007 	struct compat_kexec_segment in;
1008 	struct kexec_segment out, __user *ksegments;
1009 	unsigned long i, result;
1010 
1011 	/* Don't allow clients that don't understand the native
1012 	 * architecture to do anything.
1013 	 */
1014 	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1015 		return -EINVAL;
1016 
1017 	if (nr_segments > KEXEC_SEGMENT_MAX)
1018 		return -EINVAL;
1019 
1020 	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1021 	for (i=0; i < nr_segments; i++) {
1022 		result = copy_from_user(&in, &segments[i], sizeof(in));
1023 		if (result)
1024 			return -EFAULT;
1025 
1026 		out.buf   = compat_ptr(in.buf);
1027 		out.bufsz = in.bufsz;
1028 		out.mem   = in.mem;
1029 		out.memsz = in.memsz;
1030 
1031 		result = copy_to_user(&ksegments[i], &out, sizeof(out));
1032 		if (result)
1033 			return -EFAULT;
1034 	}
1035 
1036 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
1037 }
1038 #endif
1039 
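/*
 * Called on the crash path (see kexec_should_crash() above): if a
 * crash kernel has been loaded, shut this kernel down just far
 * enough to hand control over to it.  The kexec_lock is taken with
 * a non-blocking xchg() because we may be in interrupt context.
 */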
1040 void crash_kexec(struct pt_regs *regs)
1041 {
1042 	struct kimage *image;
1043 	int locked;
1044 
1046 	/* Take the kexec_lock here to prevent sys_kexec_load
1047 	 * running on one cpu from replacing the crash kernel
1048 	 * we are using after a panic on a different cpu.
1049 	 *
1050 	 * If the crash kernel was not located in a fixed area
1051 	 * of memory the xchg(&kexec_crash_image) would be
1052 	 * sufficient.  But since I reuse the memory...
1053 	 */
1054 	locked = xchg(&kexec_lock, 1);
1055 	if (!locked) {
1056 		image = xchg(&kexec_crash_image, NULL);
1057 		if (image) {
1058 			machine_crash_shutdown(regs);
1059 			machine_kexec(image);
1060 		}
1061 		xchg(&kexec_lock, 0);
1062 	}
1063 }
1064