1 /* 2 * kexec.c - kexec system call 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * 5 * This source code is licensed under the GNU General Public License, 6 * Version 2. See the file COPYING for more details. 7 */ 8 9 #include <linux/capability.h> 10 #include <linux/mm.h> 11 #include <linux/file.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/kexec.h> 15 #include <linux/mutex.h> 16 #include <linux/list.h> 17 #include <linux/highmem.h> 18 #include <linux/syscalls.h> 19 #include <linux/reboot.h> 20 #include <linux/ioport.h> 21 #include <linux/hardirq.h> 22 #include <linux/elf.h> 23 #include <linux/elfcore.h> 24 #include <linux/utsrelease.h> 25 #include <linux/utsname.h> 26 #include <linux/numa.h> 27 #include <linux/suspend.h> 28 #include <linux/device.h> 29 #include <linux/freezer.h> 30 #include <linux/pm.h> 31 #include <linux/cpu.h> 32 #include <linux/console.h> 33 34 #include <asm/page.h> 35 #include <asm/uaccess.h> 36 #include <asm/io.h> 37 #include <asm/system.h> 38 #include <asm/sections.h> 39 40 /* Per cpu memory for storing cpu states in case of system crash. */ 41 note_buf_t* crash_notes; 42 43 /* vmcoreinfo stuff */ 44 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46 size_t vmcoreinfo_size; 47 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48 49 /* Location of the reserved area for the crash kernel */ 50 struct resource crashk_res = { 51 .name = "Crash kernel", 52 .start = 0, 53 .end = 0, 54 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 }; 56 57 int kexec_should_crash(struct task_struct *p) 58 { 59 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) 60 return 1; 61 return 0; 62 } 63 64 /* 65 * When kexec transitions to the new kernel there is a one-to-one 66 * mapping between physical and virtual addresses. On processors 67 * where you can disable the MMU this is trivial, and easy. For 68 * others it is still a simple predictable page table to setup. 69 * 70 * In that environment kexec copies the new kernel to its final 71 * resting place. This means I can only support memory whose 72 * physical address can fit in an unsigned long. In particular 73 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 74 * If the assembly stub has more restrictive requirements 75 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be 76 * defined more restrictively in <asm/kexec.h>. 77 * 78 * The code for the transition from the current kernel to the 79 * the new kernel is placed in the control_code_buffer, whose size 80 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single 81 * page of memory is necessary, but some architectures require more. 82 * Because this memory must be identity mapped in the transition from 83 * virtual to physical addresses it must live in the range 84 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily 85 * modifiable. 86 * 87 * The assembly stub in the control code buffer is passed a linked list 88 * of descriptor pages detailing the source pages of the new kernel, 89 * and the destination addresses of those source pages. As this data 90 * structure is not used in the context of the current OS, it must 91 * be self-contained. 92 * 93 * The code has been made to work with highmem pages and will use a 94 * destination page in its final resting place (if it happens 95 * to allocate it). The end product of this is that most of the 96 * physical address space, and most of RAM can be used. 97 * 98 * Future directions include: 99 * - allocating a page table with the control code buffer identity 100 * mapped, to simplify machine_kexec and make kexec_on_panic more 101 * reliable. 102 */ 103 104 /* 105 * KIMAGE_NO_DEST is an impossible destination address..., for 106 * allocating pages whose destination address we do not care about. 107 */ 108 #define KIMAGE_NO_DEST (-1UL) 109 110 static int kimage_is_destination_range(struct kimage *image, 111 unsigned long start, unsigned long end); 112 static struct page *kimage_alloc_page(struct kimage *image, 113 gfp_t gfp_mask, 114 unsigned long dest); 115 116 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 117 unsigned long nr_segments, 118 struct kexec_segment __user *segments) 119 { 120 size_t segment_bytes; 121 struct kimage *image; 122 unsigned long i; 123 int result; 124 125 /* Allocate a controlling structure */ 126 result = -ENOMEM; 127 image = kzalloc(sizeof(*image), GFP_KERNEL); 128 if (!image) 129 goto out; 130 131 image->head = 0; 132 image->entry = &image->head; 133 image->last_entry = &image->head; 134 image->control_page = ~0; /* By default this does not apply */ 135 image->start = entry; 136 image->type = KEXEC_TYPE_DEFAULT; 137 138 /* Initialize the list of control pages */ 139 INIT_LIST_HEAD(&image->control_pages); 140 141 /* Initialize the list of destination pages */ 142 INIT_LIST_HEAD(&image->dest_pages); 143 144 /* Initialize the list of unuseable pages */ 145 INIT_LIST_HEAD(&image->unuseable_pages); 146 147 /* Read in the segments */ 148 image->nr_segments = nr_segments; 149 segment_bytes = nr_segments * sizeof(*segments); 150 result = copy_from_user(image->segment, segments, segment_bytes); 151 if (result) 152 goto out; 153 154 /* 155 * Verify we have good destination addresses. The caller is 156 * responsible for making certain we don't attempt to load 157 * the new image into invalid or reserved areas of RAM. This 158 * just verifies it is an address we can use. 159 * 160 * Since the kernel does everything in page size chunks ensure 161 * the destination addreses are page aligned. Too many 162 * special cases crop of when we don't do this. The most 163 * insidious is getting overlapping destination addresses 164 * simply because addresses are changed to page size 165 * granularity. 166 */ 167 result = -EADDRNOTAVAIL; 168 for (i = 0; i < nr_segments; i++) { 169 unsigned long mstart, mend; 170 171 mstart = image->segment[i].mem; 172 mend = mstart + image->segment[i].memsz; 173 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 174 goto out; 175 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 176 goto out; 177 } 178 179 /* Verify our destination addresses do not overlap. 180 * If we alloed overlapping destination addresses 181 * through very weird things can happen with no 182 * easy explanation as one segment stops on another. 183 */ 184 result = -EINVAL; 185 for (i = 0; i < nr_segments; i++) { 186 unsigned long mstart, mend; 187 unsigned long j; 188 189 mstart = image->segment[i].mem; 190 mend = mstart + image->segment[i].memsz; 191 for (j = 0; j < i; j++) { 192 unsigned long pstart, pend; 193 pstart = image->segment[j].mem; 194 pend = pstart + image->segment[j].memsz; 195 /* Do the segments overlap ? */ 196 if ((mend > pstart) && (mstart < pend)) 197 goto out; 198 } 199 } 200 201 /* Ensure our buffer sizes are strictly less than 202 * our memory sizes. This should always be the case, 203 * and it is easier to check up front than to be surprised 204 * later on. 205 */ 206 result = -EINVAL; 207 for (i = 0; i < nr_segments; i++) { 208 if (image->segment[i].bufsz > image->segment[i].memsz) 209 goto out; 210 } 211 212 result = 0; 213 out: 214 if (result == 0) 215 *rimage = image; 216 else 217 kfree(image); 218 219 return result; 220 221 } 222 223 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 224 unsigned long nr_segments, 225 struct kexec_segment __user *segments) 226 { 227 int result; 228 struct kimage *image; 229 230 /* Allocate and initialize a controlling structure */ 231 image = NULL; 232 result = do_kimage_alloc(&image, entry, nr_segments, segments); 233 if (result) 234 goto out; 235 236 *rimage = image; 237 238 /* 239 * Find a location for the control code buffer, and add it 240 * the vector of segments so that it's pages will also be 241 * counted as destination pages. 242 */ 243 result = -ENOMEM; 244 image->control_code_page = kimage_alloc_control_pages(image, 245 get_order(KEXEC_CONTROL_PAGE_SIZE)); 246 if (!image->control_code_page) { 247 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 248 goto out; 249 } 250 251 image->swap_page = kimage_alloc_control_pages(image, 0); 252 if (!image->swap_page) { 253 printk(KERN_ERR "Could not allocate swap buffer\n"); 254 goto out; 255 } 256 257 result = 0; 258 out: 259 if (result == 0) 260 *rimage = image; 261 else 262 kfree(image); 263 264 return result; 265 } 266 267 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 268 unsigned long nr_segments, 269 struct kexec_segment __user *segments) 270 { 271 int result; 272 struct kimage *image; 273 unsigned long i; 274 275 image = NULL; 276 /* Verify we have a valid entry point */ 277 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 278 result = -EADDRNOTAVAIL; 279 goto out; 280 } 281 282 /* Allocate and initialize a controlling structure */ 283 result = do_kimage_alloc(&image, entry, nr_segments, segments); 284 if (result) 285 goto out; 286 287 /* Enable the special crash kernel control page 288 * allocation policy. 289 */ 290 image->control_page = crashk_res.start; 291 image->type = KEXEC_TYPE_CRASH; 292 293 /* 294 * Verify we have good destination addresses. Normally 295 * the caller is responsible for making certain we don't 296 * attempt to load the new image into invalid or reserved 297 * areas of RAM. But crash kernels are preloaded into a 298 * reserved area of ram. We must ensure the addresses 299 * are in the reserved area otherwise preloading the 300 * kernel could corrupt things. 301 */ 302 result = -EADDRNOTAVAIL; 303 for (i = 0; i < nr_segments; i++) { 304 unsigned long mstart, mend; 305 306 mstart = image->segment[i].mem; 307 mend = mstart + image->segment[i].memsz - 1; 308 /* Ensure we are within the crash kernel limits */ 309 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 310 goto out; 311 } 312 313 /* 314 * Find a location for the control code buffer, and add 315 * the vector of segments so that it's pages will also be 316 * counted as destination pages. 317 */ 318 result = -ENOMEM; 319 image->control_code_page = kimage_alloc_control_pages(image, 320 get_order(KEXEC_CONTROL_PAGE_SIZE)); 321 if (!image->control_code_page) { 322 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 323 goto out; 324 } 325 326 result = 0; 327 out: 328 if (result == 0) 329 *rimage = image; 330 else 331 kfree(image); 332 333 return result; 334 } 335 336 static int kimage_is_destination_range(struct kimage *image, 337 unsigned long start, 338 unsigned long end) 339 { 340 unsigned long i; 341 342 for (i = 0; i < image->nr_segments; i++) { 343 unsigned long mstart, mend; 344 345 mstart = image->segment[i].mem; 346 mend = mstart + image->segment[i].memsz; 347 if ((end > mstart) && (start < mend)) 348 return 1; 349 } 350 351 return 0; 352 } 353 354 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 355 { 356 struct page *pages; 357 358 pages = alloc_pages(gfp_mask, order); 359 if (pages) { 360 unsigned int count, i; 361 pages->mapping = NULL; 362 set_page_private(pages, order); 363 count = 1 << order; 364 for (i = 0; i < count; i++) 365 SetPageReserved(pages + i); 366 } 367 368 return pages; 369 } 370 371 static void kimage_free_pages(struct page *page) 372 { 373 unsigned int order, count, i; 374 375 order = page_private(page); 376 count = 1 << order; 377 for (i = 0; i < count; i++) 378 ClearPageReserved(page + i); 379 __free_pages(page, order); 380 } 381 382 static void kimage_free_page_list(struct list_head *list) 383 { 384 struct list_head *pos, *next; 385 386 list_for_each_safe(pos, next, list) { 387 struct page *page; 388 389 page = list_entry(pos, struct page, lru); 390 list_del(&page->lru); 391 kimage_free_pages(page); 392 } 393 } 394 395 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 396 unsigned int order) 397 { 398 /* Control pages are special, they are the intermediaries 399 * that are needed while we copy the rest of the pages 400 * to their final resting place. As such they must 401 * not conflict with either the destination addresses 402 * or memory the kernel is already using. 403 * 404 * The only case where we really need more than one of 405 * these are for architectures where we cannot disable 406 * the MMU and must instead generate an identity mapped 407 * page table for all of the memory. 408 * 409 * At worst this runs in O(N) of the image size. 410 */ 411 struct list_head extra_pages; 412 struct page *pages; 413 unsigned int count; 414 415 count = 1 << order; 416 INIT_LIST_HEAD(&extra_pages); 417 418 /* Loop while I can allocate a page and the page allocated 419 * is a destination page. 420 */ 421 do { 422 unsigned long pfn, epfn, addr, eaddr; 423 424 pages = kimage_alloc_pages(GFP_KERNEL, order); 425 if (!pages) 426 break; 427 pfn = page_to_pfn(pages); 428 epfn = pfn + count; 429 addr = pfn << PAGE_SHIFT; 430 eaddr = epfn << PAGE_SHIFT; 431 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 432 kimage_is_destination_range(image, addr, eaddr)) { 433 list_add(&pages->lru, &extra_pages); 434 pages = NULL; 435 } 436 } while (!pages); 437 438 if (pages) { 439 /* Remember the allocated page... */ 440 list_add(&pages->lru, &image->control_pages); 441 442 /* Because the page is already in it's destination 443 * location we will never allocate another page at 444 * that address. Therefore kimage_alloc_pages 445 * will not return it (again) and we don't need 446 * to give it an entry in image->segment[]. 447 */ 448 } 449 /* Deal with the destination pages I have inadvertently allocated. 450 * 451 * Ideally I would convert multi-page allocations into single 452 * page allocations, and add everyting to image->dest_pages. 453 * 454 * For now it is simpler to just free the pages. 455 */ 456 kimage_free_page_list(&extra_pages); 457 458 return pages; 459 } 460 461 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 462 unsigned int order) 463 { 464 /* Control pages are special, they are the intermediaries 465 * that are needed while we copy the rest of the pages 466 * to their final resting place. As such they must 467 * not conflict with either the destination addresses 468 * or memory the kernel is already using. 469 * 470 * Control pages are also the only pags we must allocate 471 * when loading a crash kernel. All of the other pages 472 * are specified by the segments and we just memcpy 473 * into them directly. 474 * 475 * The only case where we really need more than one of 476 * these are for architectures where we cannot disable 477 * the MMU and must instead generate an identity mapped 478 * page table for all of the memory. 479 * 480 * Given the low demand this implements a very simple 481 * allocator that finds the first hole of the appropriate 482 * size in the reserved memory region, and allocates all 483 * of the memory up to and including the hole. 484 */ 485 unsigned long hole_start, hole_end, size; 486 struct page *pages; 487 488 pages = NULL; 489 size = (1 << order) << PAGE_SHIFT; 490 hole_start = (image->control_page + (size - 1)) & ~(size - 1); 491 hole_end = hole_start + size - 1; 492 while (hole_end <= crashk_res.end) { 493 unsigned long i; 494 495 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 496 break; 497 if (hole_end > crashk_res.end) 498 break; 499 /* See if I overlap any of the segments */ 500 for (i = 0; i < image->nr_segments; i++) { 501 unsigned long mstart, mend; 502 503 mstart = image->segment[i].mem; 504 mend = mstart + image->segment[i].memsz - 1; 505 if ((hole_end >= mstart) && (hole_start <= mend)) { 506 /* Advance the hole to the end of the segment */ 507 hole_start = (mend + (size - 1)) & ~(size - 1); 508 hole_end = hole_start + size - 1; 509 break; 510 } 511 } 512 /* If I don't overlap any segments I have found my hole! */ 513 if (i == image->nr_segments) { 514 pages = pfn_to_page(hole_start >> PAGE_SHIFT); 515 break; 516 } 517 } 518 if (pages) 519 image->control_page = hole_end; 520 521 return pages; 522 } 523 524 525 struct page *kimage_alloc_control_pages(struct kimage *image, 526 unsigned int order) 527 { 528 struct page *pages = NULL; 529 530 switch (image->type) { 531 case KEXEC_TYPE_DEFAULT: 532 pages = kimage_alloc_normal_control_pages(image, order); 533 break; 534 case KEXEC_TYPE_CRASH: 535 pages = kimage_alloc_crash_control_pages(image, order); 536 break; 537 } 538 539 return pages; 540 } 541 542 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 543 { 544 if (*image->entry != 0) 545 image->entry++; 546 547 if (image->entry == image->last_entry) { 548 kimage_entry_t *ind_page; 549 struct page *page; 550 551 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 552 if (!page) 553 return -ENOMEM; 554 555 ind_page = page_address(page); 556 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 557 image->entry = ind_page; 558 image->last_entry = ind_page + 559 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 560 } 561 *image->entry = entry; 562 image->entry++; 563 *image->entry = 0; 564 565 return 0; 566 } 567 568 static int kimage_set_destination(struct kimage *image, 569 unsigned long destination) 570 { 571 int result; 572 573 destination &= PAGE_MASK; 574 result = kimage_add_entry(image, destination | IND_DESTINATION); 575 if (result == 0) 576 image->destination = destination; 577 578 return result; 579 } 580 581 582 static int kimage_add_page(struct kimage *image, unsigned long page) 583 { 584 int result; 585 586 page &= PAGE_MASK; 587 result = kimage_add_entry(image, page | IND_SOURCE); 588 if (result == 0) 589 image->destination += PAGE_SIZE; 590 591 return result; 592 } 593 594 595 static void kimage_free_extra_pages(struct kimage *image) 596 { 597 /* Walk through and free any extra destination pages I may have */ 598 kimage_free_page_list(&image->dest_pages); 599 600 /* Walk through and free any unuseable pages I have cached */ 601 kimage_free_page_list(&image->unuseable_pages); 602 603 } 604 static void kimage_terminate(struct kimage *image) 605 { 606 if (*image->entry != 0) 607 image->entry++; 608 609 *image->entry = IND_DONE; 610 } 611 612 #define for_each_kimage_entry(image, ptr, entry) \ 613 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 614 ptr = (entry & IND_INDIRECTION)? \ 615 phys_to_virt((entry & PAGE_MASK)): ptr +1) 616 617 static void kimage_free_entry(kimage_entry_t entry) 618 { 619 struct page *page; 620 621 page = pfn_to_page(entry >> PAGE_SHIFT); 622 kimage_free_pages(page); 623 } 624 625 static void kimage_free(struct kimage *image) 626 { 627 kimage_entry_t *ptr, entry; 628 kimage_entry_t ind = 0; 629 630 if (!image) 631 return; 632 633 kimage_free_extra_pages(image); 634 for_each_kimage_entry(image, ptr, entry) { 635 if (entry & IND_INDIRECTION) { 636 /* Free the previous indirection page */ 637 if (ind & IND_INDIRECTION) 638 kimage_free_entry(ind); 639 /* Save this indirection page until we are 640 * done with it. 641 */ 642 ind = entry; 643 } 644 else if (entry & IND_SOURCE) 645 kimage_free_entry(entry); 646 } 647 /* Free the final indirection page */ 648 if (ind & IND_INDIRECTION) 649 kimage_free_entry(ind); 650 651 /* Handle any machine specific cleanup */ 652 machine_kexec_cleanup(image); 653 654 /* Free the kexec control pages... */ 655 kimage_free_page_list(&image->control_pages); 656 kfree(image); 657 } 658 659 static kimage_entry_t *kimage_dst_used(struct kimage *image, 660 unsigned long page) 661 { 662 kimage_entry_t *ptr, entry; 663 unsigned long destination = 0; 664 665 for_each_kimage_entry(image, ptr, entry) { 666 if (entry & IND_DESTINATION) 667 destination = entry & PAGE_MASK; 668 else if (entry & IND_SOURCE) { 669 if (page == destination) 670 return ptr; 671 destination += PAGE_SIZE; 672 } 673 } 674 675 return NULL; 676 } 677 678 static struct page *kimage_alloc_page(struct kimage *image, 679 gfp_t gfp_mask, 680 unsigned long destination) 681 { 682 /* 683 * Here we implement safeguards to ensure that a source page 684 * is not copied to its destination page before the data on 685 * the destination page is no longer useful. 686 * 687 * To do this we maintain the invariant that a source page is 688 * either its own destination page, or it is not a 689 * destination page at all. 690 * 691 * That is slightly stronger than required, but the proof 692 * that no problems will not occur is trivial, and the 693 * implementation is simply to verify. 694 * 695 * When allocating all pages normally this algorithm will run 696 * in O(N) time, but in the worst case it will run in O(N^2) 697 * time. If the runtime is a problem the data structures can 698 * be fixed. 699 */ 700 struct page *page; 701 unsigned long addr; 702 703 /* 704 * Walk through the list of destination pages, and see if I 705 * have a match. 706 */ 707 list_for_each_entry(page, &image->dest_pages, lru) { 708 addr = page_to_pfn(page) << PAGE_SHIFT; 709 if (addr == destination) { 710 list_del(&page->lru); 711 return page; 712 } 713 } 714 page = NULL; 715 while (1) { 716 kimage_entry_t *old; 717 718 /* Allocate a page, if we run out of memory give up */ 719 page = kimage_alloc_pages(gfp_mask, 0); 720 if (!page) 721 return NULL; 722 /* If the page cannot be used file it away */ 723 if (page_to_pfn(page) > 724 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 725 list_add(&page->lru, &image->unuseable_pages); 726 continue; 727 } 728 addr = page_to_pfn(page) << PAGE_SHIFT; 729 730 /* If it is the destination page we want use it */ 731 if (addr == destination) 732 break; 733 734 /* If the page is not a destination page use it */ 735 if (!kimage_is_destination_range(image, addr, 736 addr + PAGE_SIZE)) 737 break; 738 739 /* 740 * I know that the page is someones destination page. 741 * See if there is already a source page for this 742 * destination page. And if so swap the source pages. 743 */ 744 old = kimage_dst_used(image, addr); 745 if (old) { 746 /* If so move it */ 747 unsigned long old_addr; 748 struct page *old_page; 749 750 old_addr = *old & PAGE_MASK; 751 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 752 copy_highpage(page, old_page); 753 *old = addr | (*old & ~PAGE_MASK); 754 755 /* The old page I have found cannot be a 756 * destination page, so return it. 757 */ 758 addr = old_addr; 759 page = old_page; 760 break; 761 } 762 else { 763 /* Place the page on the destination list I 764 * will use it later. 765 */ 766 list_add(&page->lru, &image->dest_pages); 767 } 768 } 769 770 return page; 771 } 772 773 static int kimage_load_normal_segment(struct kimage *image, 774 struct kexec_segment *segment) 775 { 776 unsigned long maddr; 777 unsigned long ubytes, mbytes; 778 int result; 779 unsigned char __user *buf; 780 781 result = 0; 782 buf = segment->buf; 783 ubytes = segment->bufsz; 784 mbytes = segment->memsz; 785 maddr = segment->mem; 786 787 result = kimage_set_destination(image, maddr); 788 if (result < 0) 789 goto out; 790 791 while (mbytes) { 792 struct page *page; 793 char *ptr; 794 size_t uchunk, mchunk; 795 796 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 797 if (!page) { 798 result = -ENOMEM; 799 goto out; 800 } 801 result = kimage_add_page(image, page_to_pfn(page) 802 << PAGE_SHIFT); 803 if (result < 0) 804 goto out; 805 806 ptr = kmap(page); 807 /* Start with a clear page */ 808 memset(ptr, 0, PAGE_SIZE); 809 ptr += maddr & ~PAGE_MASK; 810 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 811 if (mchunk > mbytes) 812 mchunk = mbytes; 813 814 uchunk = mchunk; 815 if (uchunk > ubytes) 816 uchunk = ubytes; 817 818 result = copy_from_user(ptr, buf, uchunk); 819 kunmap(page); 820 if (result) { 821 result = (result < 0) ? result : -EIO; 822 goto out; 823 } 824 ubytes -= uchunk; 825 maddr += mchunk; 826 buf += mchunk; 827 mbytes -= mchunk; 828 } 829 out: 830 return result; 831 } 832 833 static int kimage_load_crash_segment(struct kimage *image, 834 struct kexec_segment *segment) 835 { 836 /* For crash dumps kernels we simply copy the data from 837 * user space to it's destination. 838 * We do things a page at a time for the sake of kmap. 839 */ 840 unsigned long maddr; 841 unsigned long ubytes, mbytes; 842 int result; 843 unsigned char __user *buf; 844 845 result = 0; 846 buf = segment->buf; 847 ubytes = segment->bufsz; 848 mbytes = segment->memsz; 849 maddr = segment->mem; 850 while (mbytes) { 851 struct page *page; 852 char *ptr; 853 size_t uchunk, mchunk; 854 855 page = pfn_to_page(maddr >> PAGE_SHIFT); 856 if (!page) { 857 result = -ENOMEM; 858 goto out; 859 } 860 ptr = kmap(page); 861 ptr += maddr & ~PAGE_MASK; 862 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 863 if (mchunk > mbytes) 864 mchunk = mbytes; 865 866 uchunk = mchunk; 867 if (uchunk > ubytes) { 868 uchunk = ubytes; 869 /* Zero the trailing part of the page */ 870 memset(ptr + uchunk, 0, mchunk - uchunk); 871 } 872 result = copy_from_user(ptr, buf, uchunk); 873 kexec_flush_icache_page(page); 874 kunmap(page); 875 if (result) { 876 result = (result < 0) ? result : -EIO; 877 goto out; 878 } 879 ubytes -= uchunk; 880 maddr += mchunk; 881 buf += mchunk; 882 mbytes -= mchunk; 883 } 884 out: 885 return result; 886 } 887 888 static int kimage_load_segment(struct kimage *image, 889 struct kexec_segment *segment) 890 { 891 int result = -ENOMEM; 892 893 switch (image->type) { 894 case KEXEC_TYPE_DEFAULT: 895 result = kimage_load_normal_segment(image, segment); 896 break; 897 case KEXEC_TYPE_CRASH: 898 result = kimage_load_crash_segment(image, segment); 899 break; 900 } 901 902 return result; 903 } 904 905 /* 906 * Exec Kernel system call: for obvious reasons only root may call it. 907 * 908 * This call breaks up into three pieces. 909 * - A generic part which loads the new kernel from the current 910 * address space, and very carefully places the data in the 911 * allocated pages. 912 * 913 * - A generic part that interacts with the kernel and tells all of 914 * the devices to shut down. Preventing on-going dmas, and placing 915 * the devices in a consistent state so a later kernel can 916 * reinitialize them. 917 * 918 * - A machine specific part that includes the syscall number 919 * and the copies the image to it's final destination. And 920 * jumps into the image at entry. 921 * 922 * kexec does not sync, or unmount filesystems so if you need 923 * that to happen you need to do that yourself. 924 */ 925 struct kimage *kexec_image; 926 struct kimage *kexec_crash_image; 927 928 static DEFINE_MUTEX(kexec_mutex); 929 930 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 931 struct kexec_segment __user *segments, 932 unsigned long flags) 933 { 934 struct kimage **dest_image, *image; 935 int result; 936 937 /* We only trust the superuser with rebooting the system. */ 938 if (!capable(CAP_SYS_BOOT)) 939 return -EPERM; 940 941 /* 942 * Verify we have a legal set of flags 943 * This leaves us room for future extensions. 944 */ 945 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 946 return -EINVAL; 947 948 /* Verify we are on the appropriate architecture */ 949 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && 950 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) 951 return -EINVAL; 952 953 /* Put an artificial cap on the number 954 * of segments passed to kexec_load. 955 */ 956 if (nr_segments > KEXEC_SEGMENT_MAX) 957 return -EINVAL; 958 959 image = NULL; 960 result = 0; 961 962 /* Because we write directly to the reserved memory 963 * region when loading crash kernels we need a mutex here to 964 * prevent multiple crash kernels from attempting to load 965 * simultaneously, and to prevent a crash kernel from loading 966 * over the top of a in use crash kernel. 967 * 968 * KISS: always take the mutex. 969 */ 970 if (!mutex_trylock(&kexec_mutex)) 971 return -EBUSY; 972 973 dest_image = &kexec_image; 974 if (flags & KEXEC_ON_CRASH) 975 dest_image = &kexec_crash_image; 976 if (nr_segments > 0) { 977 unsigned long i; 978 979 /* Loading another kernel to reboot into */ 980 if ((flags & KEXEC_ON_CRASH) == 0) 981 result = kimage_normal_alloc(&image, entry, 982 nr_segments, segments); 983 /* Loading another kernel to switch to if this one crashes */ 984 else if (flags & KEXEC_ON_CRASH) { 985 /* Free any current crash dump kernel before 986 * we corrupt it. 987 */ 988 kimage_free(xchg(&kexec_crash_image, NULL)); 989 result = kimage_crash_alloc(&image, entry, 990 nr_segments, segments); 991 } 992 if (result) 993 goto out; 994 995 if (flags & KEXEC_PRESERVE_CONTEXT) 996 image->preserve_context = 1; 997 result = machine_kexec_prepare(image); 998 if (result) 999 goto out; 1000 1001 for (i = 0; i < nr_segments; i++) { 1002 result = kimage_load_segment(image, &image->segment[i]); 1003 if (result) 1004 goto out; 1005 } 1006 kimage_terminate(image); 1007 } 1008 /* Install the new kernel, and Uninstall the old */ 1009 image = xchg(dest_image, image); 1010 1011 out: 1012 mutex_unlock(&kexec_mutex); 1013 kimage_free(image); 1014 1015 return result; 1016 } 1017 1018 #ifdef CONFIG_COMPAT 1019 asmlinkage long compat_sys_kexec_load(unsigned long entry, 1020 unsigned long nr_segments, 1021 struct compat_kexec_segment __user *segments, 1022 unsigned long flags) 1023 { 1024 struct compat_kexec_segment in; 1025 struct kexec_segment out, __user *ksegments; 1026 unsigned long i, result; 1027 1028 /* Don't allow clients that don't understand the native 1029 * architecture to do anything. 1030 */ 1031 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 1032 return -EINVAL; 1033 1034 if (nr_segments > KEXEC_SEGMENT_MAX) 1035 return -EINVAL; 1036 1037 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1038 for (i=0; i < nr_segments; i++) { 1039 result = copy_from_user(&in, &segments[i], sizeof(in)); 1040 if (result) 1041 return -EFAULT; 1042 1043 out.buf = compat_ptr(in.buf); 1044 out.bufsz = in.bufsz; 1045 out.mem = in.mem; 1046 out.memsz = in.memsz; 1047 1048 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 1049 if (result) 1050 return -EFAULT; 1051 } 1052 1053 return sys_kexec_load(entry, nr_segments, ksegments, flags); 1054 } 1055 #endif 1056 1057 void crash_kexec(struct pt_regs *regs) 1058 { 1059 /* Take the kexec_mutex here to prevent sys_kexec_load 1060 * running on one cpu from replacing the crash kernel 1061 * we are using after a panic on a different cpu. 1062 * 1063 * If the crash kernel was not located in a fixed area 1064 * of memory the xchg(&kexec_crash_image) would be 1065 * sufficient. But since I reuse the memory... 1066 */ 1067 if (mutex_trylock(&kexec_mutex)) { 1068 if (kexec_crash_image) { 1069 struct pt_regs fixed_regs; 1070 crash_setup_regs(&fixed_regs, regs); 1071 crash_save_vmcoreinfo(); 1072 machine_crash_shutdown(&fixed_regs); 1073 machine_kexec(kexec_crash_image); 1074 } 1075 mutex_unlock(&kexec_mutex); 1076 } 1077 } 1078 1079 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1080 size_t data_len) 1081 { 1082 struct elf_note note; 1083 1084 note.n_namesz = strlen(name) + 1; 1085 note.n_descsz = data_len; 1086 note.n_type = type; 1087 memcpy(buf, ¬e, sizeof(note)); 1088 buf += (sizeof(note) + 3)/4; 1089 memcpy(buf, name, note.n_namesz); 1090 buf += (note.n_namesz + 3)/4; 1091 memcpy(buf, data, note.n_descsz); 1092 buf += (note.n_descsz + 3)/4; 1093 1094 return buf; 1095 } 1096 1097 static void final_note(u32 *buf) 1098 { 1099 struct elf_note note; 1100 1101 note.n_namesz = 0; 1102 note.n_descsz = 0; 1103 note.n_type = 0; 1104 memcpy(buf, ¬e, sizeof(note)); 1105 } 1106 1107 void crash_save_cpu(struct pt_regs *regs, int cpu) 1108 { 1109 struct elf_prstatus prstatus; 1110 u32 *buf; 1111 1112 if ((cpu < 0) || (cpu >= NR_CPUS)) 1113 return; 1114 1115 /* Using ELF notes here is opportunistic. 1116 * I need a well defined structure format 1117 * for the data I pass, and I need tags 1118 * on the data to indicate what information I have 1119 * squirrelled away. ELF notes happen to provide 1120 * all of that, so there is no need to invent something new. 1121 */ 1122 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1123 if (!buf) 1124 return; 1125 memset(&prstatus, 0, sizeof(prstatus)); 1126 prstatus.pr_pid = current->pid; 1127 elf_core_copy_regs(&prstatus.pr_reg, regs); 1128 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1129 &prstatus, sizeof(prstatus)); 1130 final_note(buf); 1131 } 1132 1133 static int __init crash_notes_memory_init(void) 1134 { 1135 /* Allocate memory for saving cpu registers. */ 1136 crash_notes = alloc_percpu(note_buf_t); 1137 if (!crash_notes) { 1138 printk("Kexec: Memory allocation for saving cpu register" 1139 " states failed\n"); 1140 return -ENOMEM; 1141 } 1142 return 0; 1143 } 1144 module_init(crash_notes_memory_init) 1145 1146 1147 /* 1148 * parsing the "crashkernel" commandline 1149 * 1150 * this code is intended to be called from architecture specific code 1151 */ 1152 1153 1154 /* 1155 * This function parses command lines in the format 1156 * 1157 * crashkernel=ramsize-range:size[,...][@offset] 1158 * 1159 * The function returns 0 on success and -EINVAL on failure. 1160 */ 1161 static int __init parse_crashkernel_mem(char *cmdline, 1162 unsigned long long system_ram, 1163 unsigned long long *crash_size, 1164 unsigned long long *crash_base) 1165 { 1166 char *cur = cmdline, *tmp; 1167 1168 /* for each entry of the comma-separated list */ 1169 do { 1170 unsigned long long start, end = ULLONG_MAX, size; 1171 1172 /* get the start of the range */ 1173 start = memparse(cur, &tmp); 1174 if (cur == tmp) { 1175 pr_warning("crashkernel: Memory value expected\n"); 1176 return -EINVAL; 1177 } 1178 cur = tmp; 1179 if (*cur != '-') { 1180 pr_warning("crashkernel: '-' expected\n"); 1181 return -EINVAL; 1182 } 1183 cur++; 1184 1185 /* if no ':' is here, than we read the end */ 1186 if (*cur != ':') { 1187 end = memparse(cur, &tmp); 1188 if (cur == tmp) { 1189 pr_warning("crashkernel: Memory " 1190 "value expected\n"); 1191 return -EINVAL; 1192 } 1193 cur = tmp; 1194 if (end <= start) { 1195 pr_warning("crashkernel: end <= start\n"); 1196 return -EINVAL; 1197 } 1198 } 1199 1200 if (*cur != ':') { 1201 pr_warning("crashkernel: ':' expected\n"); 1202 return -EINVAL; 1203 } 1204 cur++; 1205 1206 size = memparse(cur, &tmp); 1207 if (cur == tmp) { 1208 pr_warning("Memory value expected\n"); 1209 return -EINVAL; 1210 } 1211 cur = tmp; 1212 if (size >= system_ram) { 1213 pr_warning("crashkernel: invalid size\n"); 1214 return -EINVAL; 1215 } 1216 1217 /* match ? */ 1218 if (system_ram >= start && system_ram < end) { 1219 *crash_size = size; 1220 break; 1221 } 1222 } while (*cur++ == ','); 1223 1224 if (*crash_size > 0) { 1225 while (*cur != ' ' && *cur != '@') 1226 cur++; 1227 if (*cur == '@') { 1228 cur++; 1229 *crash_base = memparse(cur, &tmp); 1230 if (cur == tmp) { 1231 pr_warning("Memory value expected " 1232 "after '@'\n"); 1233 return -EINVAL; 1234 } 1235 } 1236 } 1237 1238 return 0; 1239 } 1240 1241 /* 1242 * That function parses "simple" (old) crashkernel command lines like 1243 * 1244 * crashkernel=size[@offset] 1245 * 1246 * It returns 0 on success and -EINVAL on failure. 1247 */ 1248 static int __init parse_crashkernel_simple(char *cmdline, 1249 unsigned long long *crash_size, 1250 unsigned long long *crash_base) 1251 { 1252 char *cur = cmdline; 1253 1254 *crash_size = memparse(cmdline, &cur); 1255 if (cmdline == cur) { 1256 pr_warning("crashkernel: memory value expected\n"); 1257 return -EINVAL; 1258 } 1259 1260 if (*cur == '@') 1261 *crash_base = memparse(cur+1, &cur); 1262 1263 return 0; 1264 } 1265 1266 /* 1267 * That function is the entry point for command line parsing and should be 1268 * called from the arch-specific code. 1269 */ 1270 int __init parse_crashkernel(char *cmdline, 1271 unsigned long long system_ram, 1272 unsigned long long *crash_size, 1273 unsigned long long *crash_base) 1274 { 1275 char *p = cmdline, *ck_cmdline = NULL; 1276 char *first_colon, *first_space; 1277 1278 BUG_ON(!crash_size || !crash_base); 1279 *crash_size = 0; 1280 *crash_base = 0; 1281 1282 /* find crashkernel and use the last one if there are more */ 1283 p = strstr(p, "crashkernel="); 1284 while (p) { 1285 ck_cmdline = p; 1286 p = strstr(p+1, "crashkernel="); 1287 } 1288 1289 if (!ck_cmdline) 1290 return -EINVAL; 1291 1292 ck_cmdline += 12; /* strlen("crashkernel=") */ 1293 1294 /* 1295 * if the commandline contains a ':', then that's the extended 1296 * syntax -- if not, it must be the classic syntax 1297 */ 1298 first_colon = strchr(ck_cmdline, ':'); 1299 first_space = strchr(ck_cmdline, ' '); 1300 if (first_colon && (!first_space || first_colon < first_space)) 1301 return parse_crashkernel_mem(ck_cmdline, system_ram, 1302 crash_size, crash_base); 1303 else 1304 return parse_crashkernel_simple(ck_cmdline, crash_size, 1305 crash_base); 1306 1307 return 0; 1308 } 1309 1310 1311 1312 void crash_save_vmcoreinfo(void) 1313 { 1314 u32 *buf; 1315 1316 if (!vmcoreinfo_size) 1317 return; 1318 1319 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1320 1321 buf = (u32 *)vmcoreinfo_note; 1322 1323 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1324 vmcoreinfo_size); 1325 1326 final_note(buf); 1327 } 1328 1329 void vmcoreinfo_append_str(const char *fmt, ...) 1330 { 1331 va_list args; 1332 char buf[0x50]; 1333 int r; 1334 1335 va_start(args, fmt); 1336 r = vsnprintf(buf, sizeof(buf), fmt, args); 1337 va_end(args); 1338 1339 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1340 r = vmcoreinfo_max_size - vmcoreinfo_size; 1341 1342 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1343 1344 vmcoreinfo_size += r; 1345 } 1346 1347 /* 1348 * provide an empty default implementation here -- architecture 1349 * code may override this 1350 */ 1351 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1352 {} 1353 1354 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1355 { 1356 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1357 } 1358 1359 static int __init crash_save_vmcoreinfo_init(void) 1360 { 1361 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 1362 VMCOREINFO_PAGESIZE(PAGE_SIZE); 1363 1364 VMCOREINFO_SYMBOL(init_uts_ns); 1365 VMCOREINFO_SYMBOL(node_online_map); 1366 VMCOREINFO_SYMBOL(swapper_pg_dir); 1367 VMCOREINFO_SYMBOL(_stext); 1368 1369 #ifndef CONFIG_NEED_MULTIPLE_NODES 1370 VMCOREINFO_SYMBOL(mem_map); 1371 VMCOREINFO_SYMBOL(contig_page_data); 1372 #endif 1373 #ifdef CONFIG_SPARSEMEM 1374 VMCOREINFO_SYMBOL(mem_section); 1375 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1376 VMCOREINFO_STRUCT_SIZE(mem_section); 1377 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1378 #endif 1379 VMCOREINFO_STRUCT_SIZE(page); 1380 VMCOREINFO_STRUCT_SIZE(pglist_data); 1381 VMCOREINFO_STRUCT_SIZE(zone); 1382 VMCOREINFO_STRUCT_SIZE(free_area); 1383 VMCOREINFO_STRUCT_SIZE(list_head); 1384 VMCOREINFO_SIZE(nodemask_t); 1385 VMCOREINFO_OFFSET(page, flags); 1386 VMCOREINFO_OFFSET(page, _count); 1387 VMCOREINFO_OFFSET(page, mapping); 1388 VMCOREINFO_OFFSET(page, lru); 1389 VMCOREINFO_OFFSET(pglist_data, node_zones); 1390 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1391 #ifdef CONFIG_FLAT_NODE_MEM_MAP 1392 VMCOREINFO_OFFSET(pglist_data, node_mem_map); 1393 #endif 1394 VMCOREINFO_OFFSET(pglist_data, node_start_pfn); 1395 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); 1396 VMCOREINFO_OFFSET(pglist_data, node_id); 1397 VMCOREINFO_OFFSET(zone, free_area); 1398 VMCOREINFO_OFFSET(zone, vm_stat); 1399 VMCOREINFO_OFFSET(zone, spanned_pages); 1400 VMCOREINFO_OFFSET(free_area, free_list); 1401 VMCOREINFO_OFFSET(list_head, next); 1402 VMCOREINFO_OFFSET(list_head, prev); 1403 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1404 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1405 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1406 VMCOREINFO_NUMBER(PG_lru); 1407 VMCOREINFO_NUMBER(PG_private); 1408 VMCOREINFO_NUMBER(PG_swapcache); 1409 1410 arch_crash_save_vmcoreinfo(); 1411 1412 return 0; 1413 } 1414 1415 module_init(crash_save_vmcoreinfo_init) 1416 1417 /* 1418 * Move into place and start executing a preloaded standalone 1419 * executable. If nothing was preloaded return an error. 1420 */ 1421 int kernel_kexec(void) 1422 { 1423 int error = 0; 1424 1425 if (!mutex_trylock(&kexec_mutex)) 1426 return -EBUSY; 1427 if (!kexec_image) { 1428 error = -EINVAL; 1429 goto Unlock; 1430 } 1431 1432 #ifdef CONFIG_KEXEC_JUMP 1433 if (kexec_image->preserve_context) { 1434 mutex_lock(&pm_mutex); 1435 pm_prepare_console(); 1436 error = freeze_processes(); 1437 if (error) { 1438 error = -EBUSY; 1439 goto Restore_console; 1440 } 1441 suspend_console(); 1442 error = device_suspend(PMSG_FREEZE); 1443 if (error) 1444 goto Resume_console; 1445 error = disable_nonboot_cpus(); 1446 if (error) 1447 goto Resume_devices; 1448 device_pm_lock(); 1449 local_irq_disable(); 1450 /* At this point, device_suspend() has been called, 1451 * but *not* device_power_down(). We *must* 1452 * device_power_down() now. Otherwise, drivers for 1453 * some devices (e.g. interrupt controllers) become 1454 * desynchronized with the actual state of the 1455 * hardware at resume time, and evil weirdness ensues. 1456 */ 1457 error = device_power_down(PMSG_FREEZE); 1458 if (error) 1459 goto Enable_irqs; 1460 } else 1461 #endif 1462 { 1463 kernel_restart_prepare(NULL); 1464 printk(KERN_EMERG "Starting new kernel\n"); 1465 machine_shutdown(); 1466 } 1467 1468 machine_kexec(kexec_image); 1469 1470 #ifdef CONFIG_KEXEC_JUMP 1471 if (kexec_image->preserve_context) { 1472 device_power_up(PMSG_RESTORE); 1473 Enable_irqs: 1474 local_irq_enable(); 1475 device_pm_unlock(); 1476 enable_nonboot_cpus(); 1477 Resume_devices: 1478 device_resume(PMSG_RESTORE); 1479 Resume_console: 1480 resume_console(); 1481 thaw_processes(); 1482 Restore_console: 1483 pm_restore_console(); 1484 mutex_unlock(&pm_mutex); 1485 } 1486 #endif 1487 1488 Unlock: 1489 mutex_unlock(&kexec_mutex); 1490 return error; 1491 } 1492