/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
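/*
 * Illustrative sketch only (never compiled; not part of this file's
 * logic): the reassembly loop that the architecture's control code
 * stub effectively performs at kexec time, written in C.  The entry
 * list is the one built by kimage_add_entry() below: each
 * kimage_entry_t is a page-aligned physical address with an IND_*
 * type in its low bits.  The helper name relocate_kernel_sketch() is
 * hypothetical; the real work is done in assembly with the MMU off or
 * identity mapped.
 */
#if 0
static void relocate_kernel_sketch(kimage_entry_t *ptr)
{
	kimage_entry_t entry;
	void *dest = NULL;

	while (!((entry = *ptr++) & IND_DONE)) {
		if (entry & IND_DESTINATION)
			/* Subsequent source pages land here */
			dest = (void *)(entry & PAGE_MASK);
		else if (entry & IND_INDIRECTION)
			/* Continue reading entries from a new page */
			ptr = (kimage_entry_t *)(entry & PAGE_MASK);
		else if (entry & IND_SOURCE) {
			/* Copy one source page, advance the destination */
			memcpy(dest, (void *)(entry & PAGE_MASK), PAGE_SIZE);
			dest += PAGE_SIZE;
		}
	}
}
#endif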
/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      unsigned int gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	if (copy_from_user(image->segment, segments, segment_bytes)) {
		result = -EFAULT;
		goto out;
	}

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stomps on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes do not exceed our memory sizes.
	 * This should always be the case, and it is easier to check
	 * up front than to be surprised later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(unsigned int gfp_mask,
				       unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		pages->private = order;
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page->private;
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}
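/*
 * Worked example (hypothetical addresses) for the hole search done by
 * kimage_alloc_crash_control_pages() below, assuming a 4 KiB PAGE_SIZE,
 * order 0 and crashk_res.start == 0x03000000:
 *
 *	size       = (1 << 0) << PAGE_SHIFT        = 0x1000
 *	hole_start = (0x03000000 + 0xfff) & ~0xfff = 0x03000000
 *	hole_end   = hole_start + 0x1000 - 1       = 0x03000fff
 *
 * If a segment already occupies 0x03000000 - 0x0301ffff the hole is
 * advanced past it:
 *
 *	hole_start = (0x0301ffff + 0xfff) & ~0xfff = 0x03020000
 *	hole_end   = 0x03020fff
 *
 * and the search repeats until the hole overlaps no segment or runs
 * past crashk_res.end.
 */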
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}

struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      unsigned int gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}

static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf    = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr  = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
						<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf    = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr  = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down,
 *   preventing ongoing DMAs and placing the devices in a consistent
 *   state so a later kernel can reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home-grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
			       struct kexec_segment __user *segments,
			       unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags.
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
						     nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						    nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				      unsigned long nr_segments,
				      struct compat_kexec_segment __user *segments,
				      unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			machine_crash_shutdown(regs);
			machine_kexec(image);
		}
		xchg(&kexec_lock, 0);
	}
}
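/*
 * Illustrative userspace sketch (never compiled here; not kernel code):
 * roughly how a loader drives the kexec_load syscall implemented above.
 * The payload buffer, the 1 MiB load address, the entry value and the
 * segment sizes are placeholder assumptions; a real loader such as
 * kexec-tools builds the segment list from an actual kernel image.
 * The local struct simply mirrors the kexec_segment layout used here.
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct kexec_segment {
	const void *buf;	/* user space buffer holding the data */
	size_t bufsz;		/* bytes to copy from buf */
	const void *mem;	/* physical destination address */
	size_t memsz;		/* bytes reserved at mem (zero padded) */
};

#define KEXEC_ARCH_DEFAULT	0

int main(void)
{
	static char payload[4096];		/* stand-in for image data */
	struct kexec_segment seg = {
		.buf   = payload,
		.bufsz = sizeof(payload),
		.mem   = (const void *)0x100000,	/* page aligned */
		.memsz = 0x2000,		/* >= bufsz, page aligned */
	};

	/* entry = 0x100000, one segment, default architecture flags */
	if (syscall(__NR_kexec_load, 0x100000UL, 1UL, &seg,
		    (unsigned long)KEXEC_ARCH_DEFAULT) != 0) {
		perror("kexec_load");
		return 1;
	}
	return 0;
}
#endif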