1 /* 2 * kexec.c - kexec system call 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * 5 * This source code is licensed under the GNU General Public License, 6 * Version 2. See the file COPYING for more details. 7 */ 8 9 #include <linux/capability.h> 10 #include <linux/mm.h> 11 #include <linux/file.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/kexec.h> 15 #include <linux/mutex.h> 16 #include <linux/list.h> 17 #include <linux/highmem.h> 18 #include <linux/syscalls.h> 19 #include <linux/reboot.h> 20 #include <linux/ioport.h> 21 #include <linux/hardirq.h> 22 #include <linux/elf.h> 23 #include <linux/elfcore.h> 24 #include <linux/utsrelease.h> 25 #include <linux/utsname.h> 26 #include <linux/numa.h> 27 #include <linux/suspend.h> 28 #include <linux/device.h> 29 #include <linux/freezer.h> 30 #include <linux/pm.h> 31 #include <linux/cpu.h> 32 #include <linux/console.h> 33 34 #include <asm/page.h> 35 #include <asm/uaccess.h> 36 #include <asm/io.h> 37 #include <asm/system.h> 38 #include <asm/sections.h> 39 40 /* Per cpu memory for storing cpu states in case of system crash. */ 41 note_buf_t* crash_notes; 42 43 /* vmcoreinfo stuff */ 44 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46 size_t vmcoreinfo_size; 47 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48 49 /* Location of the reserved area for the crash kernel */ 50 struct resource crashk_res = { 51 .name = "Crash kernel", 52 .start = 0, 53 .end = 0, 54 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 }; 56 57 int kexec_should_crash(struct task_struct *p) 58 { 59 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) 60 return 1; 61 return 0; 62 } 63 64 /* 65 * When kexec transitions to the new kernel there is a one-to-one 66 * mapping between physical and virtual addresses. On processors 67 * where you can disable the MMU this is trivial, and easy. For 68 * others it is still a simple predictable page table to setup. 69 * 70 * In that environment kexec copies the new kernel to its final 71 * resting place. This means I can only support memory whose 72 * physical address can fit in an unsigned long. In particular 73 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 74 * If the assembly stub has more restrictive requirements 75 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be 76 * defined more restrictively in <asm/kexec.h>. 77 * 78 * The code for the transition from the current kernel to the 79 * the new kernel is placed in the control_code_buffer, whose size 80 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single 81 * page of memory is necessary, but some architectures require more. 82 * Because this memory must be identity mapped in the transition from 83 * virtual to physical addresses it must live in the range 84 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily 85 * modifiable. 86 * 87 * The assembly stub in the control code buffer is passed a linked list 88 * of descriptor pages detailing the source pages of the new kernel, 89 * and the destination addresses of those source pages. As this data 90 * structure is not used in the context of the current OS, it must 91 * be self-contained. 92 * 93 * The code has been made to work with highmem pages and will use a 94 * destination page in its final resting place (if it happens 95 * to allocate it). The end product of this is that most of the 96 * physical address space, and most of RAM can be used. 97 * 98 * Future directions include: 99 * - allocating a page table with the control code buffer identity 100 * mapped, to simplify machine_kexec and make kexec_on_panic more 101 * reliable. 102 */ 103 104 /* 105 * KIMAGE_NO_DEST is an impossible destination address..., for 106 * allocating pages whose destination address we do not care about. 107 */ 108 #define KIMAGE_NO_DEST (-1UL) 109 110 static int kimage_is_destination_range(struct kimage *image, 111 unsigned long start, unsigned long end); 112 static struct page *kimage_alloc_page(struct kimage *image, 113 gfp_t gfp_mask, 114 unsigned long dest); 115 116 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 117 unsigned long nr_segments, 118 struct kexec_segment __user *segments) 119 { 120 size_t segment_bytes; 121 struct kimage *image; 122 unsigned long i; 123 int result; 124 125 /* Allocate a controlling structure */ 126 result = -ENOMEM; 127 image = kzalloc(sizeof(*image), GFP_KERNEL); 128 if (!image) 129 goto out; 130 131 image->head = 0; 132 image->entry = &image->head; 133 image->last_entry = &image->head; 134 image->control_page = ~0; /* By default this does not apply */ 135 image->start = entry; 136 image->type = KEXEC_TYPE_DEFAULT; 137 138 /* Initialize the list of control pages */ 139 INIT_LIST_HEAD(&image->control_pages); 140 141 /* Initialize the list of destination pages */ 142 INIT_LIST_HEAD(&image->dest_pages); 143 144 /* Initialize the list of unuseable pages */ 145 INIT_LIST_HEAD(&image->unuseable_pages); 146 147 /* Read in the segments */ 148 image->nr_segments = nr_segments; 149 segment_bytes = nr_segments * sizeof(*segments); 150 result = copy_from_user(image->segment, segments, segment_bytes); 151 if (result) 152 goto out; 153 154 /* 155 * Verify we have good destination addresses. The caller is 156 * responsible for making certain we don't attempt to load 157 * the new image into invalid or reserved areas of RAM. This 158 * just verifies it is an address we can use. 159 * 160 * Since the kernel does everything in page size chunks ensure 161 * the destination addreses are page aligned. Too many 162 * special cases crop of when we don't do this. The most 163 * insidious is getting overlapping destination addresses 164 * simply because addresses are changed to page size 165 * granularity. 166 */ 167 result = -EADDRNOTAVAIL; 168 for (i = 0; i < nr_segments; i++) { 169 unsigned long mstart, mend; 170 171 mstart = image->segment[i].mem; 172 mend = mstart + image->segment[i].memsz; 173 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 174 goto out; 175 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 176 goto out; 177 } 178 179 /* Verify our destination addresses do not overlap. 180 * If we alloed overlapping destination addresses 181 * through very weird things can happen with no 182 * easy explanation as one segment stops on another. 183 */ 184 result = -EINVAL; 185 for (i = 0; i < nr_segments; i++) { 186 unsigned long mstart, mend; 187 unsigned long j; 188 189 mstart = image->segment[i].mem; 190 mend = mstart + image->segment[i].memsz; 191 for (j = 0; j < i; j++) { 192 unsigned long pstart, pend; 193 pstart = image->segment[j].mem; 194 pend = pstart + image->segment[j].memsz; 195 /* Do the segments overlap ? */ 196 if ((mend > pstart) && (mstart < pend)) 197 goto out; 198 } 199 } 200 201 /* Ensure our buffer sizes are strictly less than 202 * our memory sizes. This should always be the case, 203 * and it is easier to check up front than to be surprised 204 * later on. 205 */ 206 result = -EINVAL; 207 for (i = 0; i < nr_segments; i++) { 208 if (image->segment[i].bufsz > image->segment[i].memsz) 209 goto out; 210 } 211 212 result = 0; 213 out: 214 if (result == 0) 215 *rimage = image; 216 else 217 kfree(image); 218 219 return result; 220 221 } 222 223 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 224 unsigned long nr_segments, 225 struct kexec_segment __user *segments) 226 { 227 int result; 228 struct kimage *image; 229 230 /* Allocate and initialize a controlling structure */ 231 image = NULL; 232 result = do_kimage_alloc(&image, entry, nr_segments, segments); 233 if (result) 234 goto out; 235 236 *rimage = image; 237 238 /* 239 * Find a location for the control code buffer, and add it 240 * the vector of segments so that it's pages will also be 241 * counted as destination pages. 242 */ 243 result = -ENOMEM; 244 image->control_code_page = kimage_alloc_control_pages(image, 245 get_order(KEXEC_CONTROL_PAGE_SIZE)); 246 if (!image->control_code_page) { 247 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 248 goto out; 249 } 250 251 image->swap_page = kimage_alloc_control_pages(image, 0); 252 if (!image->swap_page) { 253 printk(KERN_ERR "Could not allocate swap buffer\n"); 254 goto out; 255 } 256 257 result = 0; 258 out: 259 if (result == 0) 260 *rimage = image; 261 else 262 kfree(image); 263 264 return result; 265 } 266 267 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 268 unsigned long nr_segments, 269 struct kexec_segment __user *segments) 270 { 271 int result; 272 struct kimage *image; 273 unsigned long i; 274 275 image = NULL; 276 /* Verify we have a valid entry point */ 277 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 278 result = -EADDRNOTAVAIL; 279 goto out; 280 } 281 282 /* Allocate and initialize a controlling structure */ 283 result = do_kimage_alloc(&image, entry, nr_segments, segments); 284 if (result) 285 goto out; 286 287 /* Enable the special crash kernel control page 288 * allocation policy. 289 */ 290 image->control_page = crashk_res.start; 291 image->type = KEXEC_TYPE_CRASH; 292 293 /* 294 * Verify we have good destination addresses. Normally 295 * the caller is responsible for making certain we don't 296 * attempt to load the new image into invalid or reserved 297 * areas of RAM. But crash kernels are preloaded into a 298 * reserved area of ram. We must ensure the addresses 299 * are in the reserved area otherwise preloading the 300 * kernel could corrupt things. 301 */ 302 result = -EADDRNOTAVAIL; 303 for (i = 0; i < nr_segments; i++) { 304 unsigned long mstart, mend; 305 306 mstart = image->segment[i].mem; 307 mend = mstart + image->segment[i].memsz - 1; 308 /* Ensure we are within the crash kernel limits */ 309 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 310 goto out; 311 } 312 313 /* 314 * Find a location for the control code buffer, and add 315 * the vector of segments so that it's pages will also be 316 * counted as destination pages. 317 */ 318 result = -ENOMEM; 319 image->control_code_page = kimage_alloc_control_pages(image, 320 get_order(KEXEC_CONTROL_PAGE_SIZE)); 321 if (!image->control_code_page) { 322 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 323 goto out; 324 } 325 326 result = 0; 327 out: 328 if (result == 0) 329 *rimage = image; 330 else 331 kfree(image); 332 333 return result; 334 } 335 336 static int kimage_is_destination_range(struct kimage *image, 337 unsigned long start, 338 unsigned long end) 339 { 340 unsigned long i; 341 342 for (i = 0; i < image->nr_segments; i++) { 343 unsigned long mstart, mend; 344 345 mstart = image->segment[i].mem; 346 mend = mstart + image->segment[i].memsz; 347 if ((end > mstart) && (start < mend)) 348 return 1; 349 } 350 351 return 0; 352 } 353 354 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 355 { 356 struct page *pages; 357 358 pages = alloc_pages(gfp_mask, order); 359 if (pages) { 360 unsigned int count, i; 361 pages->mapping = NULL; 362 set_page_private(pages, order); 363 count = 1 << order; 364 for (i = 0; i < count; i++) 365 SetPageReserved(pages + i); 366 } 367 368 return pages; 369 } 370 371 static void kimage_free_pages(struct page *page) 372 { 373 unsigned int order, count, i; 374 375 order = page_private(page); 376 count = 1 << order; 377 for (i = 0; i < count; i++) 378 ClearPageReserved(page + i); 379 __free_pages(page, order); 380 } 381 382 static void kimage_free_page_list(struct list_head *list) 383 { 384 struct list_head *pos, *next; 385 386 list_for_each_safe(pos, next, list) { 387 struct page *page; 388 389 page = list_entry(pos, struct page, lru); 390 list_del(&page->lru); 391 kimage_free_pages(page); 392 } 393 } 394 395 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 396 unsigned int order) 397 { 398 /* Control pages are special, they are the intermediaries 399 * that are needed while we copy the rest of the pages 400 * to their final resting place. As such they must 401 * not conflict with either the destination addresses 402 * or memory the kernel is already using. 403 * 404 * The only case where we really need more than one of 405 * these are for architectures where we cannot disable 406 * the MMU and must instead generate an identity mapped 407 * page table for all of the memory. 408 * 409 * At worst this runs in O(N) of the image size. 410 */ 411 struct list_head extra_pages; 412 struct page *pages; 413 unsigned int count; 414 415 count = 1 << order; 416 INIT_LIST_HEAD(&extra_pages); 417 418 /* Loop while I can allocate a page and the page allocated 419 * is a destination page. 420 */ 421 do { 422 unsigned long pfn, epfn, addr, eaddr; 423 424 pages = kimage_alloc_pages(GFP_KERNEL, order); 425 if (!pages) 426 break; 427 pfn = page_to_pfn(pages); 428 epfn = pfn + count; 429 addr = pfn << PAGE_SHIFT; 430 eaddr = epfn << PAGE_SHIFT; 431 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 432 kimage_is_destination_range(image, addr, eaddr)) { 433 list_add(&pages->lru, &extra_pages); 434 pages = NULL; 435 } 436 } while (!pages); 437 438 if (pages) { 439 /* Remember the allocated page... */ 440 list_add(&pages->lru, &image->control_pages); 441 442 /* Because the page is already in it's destination 443 * location we will never allocate another page at 444 * that address. Therefore kimage_alloc_pages 445 * will not return it (again) and we don't need 446 * to give it an entry in image->segment[]. 447 */ 448 } 449 /* Deal with the destination pages I have inadvertently allocated. 450 * 451 * Ideally I would convert multi-page allocations into single 452 * page allocations, and add everyting to image->dest_pages. 453 * 454 * For now it is simpler to just free the pages. 455 */ 456 kimage_free_page_list(&extra_pages); 457 458 return pages; 459 } 460 461 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 462 unsigned int order) 463 { 464 /* Control pages are special, they are the intermediaries 465 * that are needed while we copy the rest of the pages 466 * to their final resting place. As such they must 467 * not conflict with either the destination addresses 468 * or memory the kernel is already using. 469 * 470 * Control pages are also the only pags we must allocate 471 * when loading a crash kernel. All of the other pages 472 * are specified by the segments and we just memcpy 473 * into them directly. 474 * 475 * The only case where we really need more than one of 476 * these are for architectures where we cannot disable 477 * the MMU and must instead generate an identity mapped 478 * page table for all of the memory. 479 * 480 * Given the low demand this implements a very simple 481 * allocator that finds the first hole of the appropriate 482 * size in the reserved memory region, and allocates all 483 * of the memory up to and including the hole. 484 */ 485 unsigned long hole_start, hole_end, size; 486 struct page *pages; 487 488 pages = NULL; 489 size = (1 << order) << PAGE_SHIFT; 490 hole_start = (image->control_page + (size - 1)) & ~(size - 1); 491 hole_end = hole_start + size - 1; 492 while (hole_end <= crashk_res.end) { 493 unsigned long i; 494 495 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 496 break; 497 if (hole_end > crashk_res.end) 498 break; 499 /* See if I overlap any of the segments */ 500 for (i = 0; i < image->nr_segments; i++) { 501 unsigned long mstart, mend; 502 503 mstart = image->segment[i].mem; 504 mend = mstart + image->segment[i].memsz - 1; 505 if ((hole_end >= mstart) && (hole_start <= mend)) { 506 /* Advance the hole to the end of the segment */ 507 hole_start = (mend + (size - 1)) & ~(size - 1); 508 hole_end = hole_start + size - 1; 509 break; 510 } 511 } 512 /* If I don't overlap any segments I have found my hole! */ 513 if (i == image->nr_segments) { 514 pages = pfn_to_page(hole_start >> PAGE_SHIFT); 515 break; 516 } 517 } 518 if (pages) 519 image->control_page = hole_end; 520 521 return pages; 522 } 523 524 525 struct page *kimage_alloc_control_pages(struct kimage *image, 526 unsigned int order) 527 { 528 struct page *pages = NULL; 529 530 switch (image->type) { 531 case KEXEC_TYPE_DEFAULT: 532 pages = kimage_alloc_normal_control_pages(image, order); 533 break; 534 case KEXEC_TYPE_CRASH: 535 pages = kimage_alloc_crash_control_pages(image, order); 536 break; 537 } 538 539 return pages; 540 } 541 542 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 543 { 544 if (*image->entry != 0) 545 image->entry++; 546 547 if (image->entry == image->last_entry) { 548 kimage_entry_t *ind_page; 549 struct page *page; 550 551 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 552 if (!page) 553 return -ENOMEM; 554 555 ind_page = page_address(page); 556 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 557 image->entry = ind_page; 558 image->last_entry = ind_page + 559 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 560 } 561 *image->entry = entry; 562 image->entry++; 563 *image->entry = 0; 564 565 return 0; 566 } 567 568 static int kimage_set_destination(struct kimage *image, 569 unsigned long destination) 570 { 571 int result; 572 573 destination &= PAGE_MASK; 574 result = kimage_add_entry(image, destination | IND_DESTINATION); 575 if (result == 0) 576 image->destination = destination; 577 578 return result; 579 } 580 581 582 static int kimage_add_page(struct kimage *image, unsigned long page) 583 { 584 int result; 585 586 page &= PAGE_MASK; 587 result = kimage_add_entry(image, page | IND_SOURCE); 588 if (result == 0) 589 image->destination += PAGE_SIZE; 590 591 return result; 592 } 593 594 595 static void kimage_free_extra_pages(struct kimage *image) 596 { 597 /* Walk through and free any extra destination pages I may have */ 598 kimage_free_page_list(&image->dest_pages); 599 600 /* Walk through and free any unuseable pages I have cached */ 601 kimage_free_page_list(&image->unuseable_pages); 602 603 } 604 static void kimage_terminate(struct kimage *image) 605 { 606 if (*image->entry != 0) 607 image->entry++; 608 609 *image->entry = IND_DONE; 610 } 611 612 #define for_each_kimage_entry(image, ptr, entry) \ 613 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 614 ptr = (entry & IND_INDIRECTION)? \ 615 phys_to_virt((entry & PAGE_MASK)): ptr +1) 616 617 static void kimage_free_entry(kimage_entry_t entry) 618 { 619 struct page *page; 620 621 page = pfn_to_page(entry >> PAGE_SHIFT); 622 kimage_free_pages(page); 623 } 624 625 static void kimage_free(struct kimage *image) 626 { 627 kimage_entry_t *ptr, entry; 628 kimage_entry_t ind = 0; 629 630 if (!image) 631 return; 632 633 kimage_free_extra_pages(image); 634 for_each_kimage_entry(image, ptr, entry) { 635 if (entry & IND_INDIRECTION) { 636 /* Free the previous indirection page */ 637 if (ind & IND_INDIRECTION) 638 kimage_free_entry(ind); 639 /* Save this indirection page until we are 640 * done with it. 641 */ 642 ind = entry; 643 } 644 else if (entry & IND_SOURCE) 645 kimage_free_entry(entry); 646 } 647 /* Free the final indirection page */ 648 if (ind & IND_INDIRECTION) 649 kimage_free_entry(ind); 650 651 /* Handle any machine specific cleanup */ 652 machine_kexec_cleanup(image); 653 654 /* Free the kexec control pages... */ 655 kimage_free_page_list(&image->control_pages); 656 kfree(image); 657 } 658 659 static kimage_entry_t *kimage_dst_used(struct kimage *image, 660 unsigned long page) 661 { 662 kimage_entry_t *ptr, entry; 663 unsigned long destination = 0; 664 665 for_each_kimage_entry(image, ptr, entry) { 666 if (entry & IND_DESTINATION) 667 destination = entry & PAGE_MASK; 668 else if (entry & IND_SOURCE) { 669 if (page == destination) 670 return ptr; 671 destination += PAGE_SIZE; 672 } 673 } 674 675 return NULL; 676 } 677 678 static struct page *kimage_alloc_page(struct kimage *image, 679 gfp_t gfp_mask, 680 unsigned long destination) 681 { 682 /* 683 * Here we implement safeguards to ensure that a source page 684 * is not copied to its destination page before the data on 685 * the destination page is no longer useful. 686 * 687 * To do this we maintain the invariant that a source page is 688 * either its own destination page, or it is not a 689 * destination page at all. 690 * 691 * That is slightly stronger than required, but the proof 692 * that no problems will not occur is trivial, and the 693 * implementation is simply to verify. 694 * 695 * When allocating all pages normally this algorithm will run 696 * in O(N) time, but in the worst case it will run in O(N^2) 697 * time. If the runtime is a problem the data structures can 698 * be fixed. 699 */ 700 struct page *page; 701 unsigned long addr; 702 703 /* 704 * Walk through the list of destination pages, and see if I 705 * have a match. 706 */ 707 list_for_each_entry(page, &image->dest_pages, lru) { 708 addr = page_to_pfn(page) << PAGE_SHIFT; 709 if (addr == destination) { 710 list_del(&page->lru); 711 return page; 712 } 713 } 714 page = NULL; 715 while (1) { 716 kimage_entry_t *old; 717 718 /* Allocate a page, if we run out of memory give up */ 719 page = kimage_alloc_pages(gfp_mask, 0); 720 if (!page) 721 return NULL; 722 /* If the page cannot be used file it away */ 723 if (page_to_pfn(page) > 724 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 725 list_add(&page->lru, &image->unuseable_pages); 726 continue; 727 } 728 addr = page_to_pfn(page) << PAGE_SHIFT; 729 730 /* If it is the destination page we want use it */ 731 if (addr == destination) 732 break; 733 734 /* If the page is not a destination page use it */ 735 if (!kimage_is_destination_range(image, addr, 736 addr + PAGE_SIZE)) 737 break; 738 739 /* 740 * I know that the page is someones destination page. 741 * See if there is already a source page for this 742 * destination page. And if so swap the source pages. 743 */ 744 old = kimage_dst_used(image, addr); 745 if (old) { 746 /* If so move it */ 747 unsigned long old_addr; 748 struct page *old_page; 749 750 old_addr = *old & PAGE_MASK; 751 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 752 copy_highpage(page, old_page); 753 *old = addr | (*old & ~PAGE_MASK); 754 755 /* The old page I have found cannot be a 756 * destination page, so return it if it's 757 * gfp_flags honor the ones passed in. 758 */ 759 if (!(gfp_mask & __GFP_HIGHMEM) && 760 PageHighMem(old_page)) { 761 kimage_free_pages(old_page); 762 continue; 763 } 764 addr = old_addr; 765 page = old_page; 766 break; 767 } 768 else { 769 /* Place the page on the destination list I 770 * will use it later. 771 */ 772 list_add(&page->lru, &image->dest_pages); 773 } 774 } 775 776 return page; 777 } 778 779 static int kimage_load_normal_segment(struct kimage *image, 780 struct kexec_segment *segment) 781 { 782 unsigned long maddr; 783 unsigned long ubytes, mbytes; 784 int result; 785 unsigned char __user *buf; 786 787 result = 0; 788 buf = segment->buf; 789 ubytes = segment->bufsz; 790 mbytes = segment->memsz; 791 maddr = segment->mem; 792 793 result = kimage_set_destination(image, maddr); 794 if (result < 0) 795 goto out; 796 797 while (mbytes) { 798 struct page *page; 799 char *ptr; 800 size_t uchunk, mchunk; 801 802 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 803 if (!page) { 804 result = -ENOMEM; 805 goto out; 806 } 807 result = kimage_add_page(image, page_to_pfn(page) 808 << PAGE_SHIFT); 809 if (result < 0) 810 goto out; 811 812 ptr = kmap(page); 813 /* Start with a clear page */ 814 memset(ptr, 0, PAGE_SIZE); 815 ptr += maddr & ~PAGE_MASK; 816 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 817 if (mchunk > mbytes) 818 mchunk = mbytes; 819 820 uchunk = mchunk; 821 if (uchunk > ubytes) 822 uchunk = ubytes; 823 824 result = copy_from_user(ptr, buf, uchunk); 825 kunmap(page); 826 if (result) { 827 result = (result < 0) ? result : -EIO; 828 goto out; 829 } 830 ubytes -= uchunk; 831 maddr += mchunk; 832 buf += mchunk; 833 mbytes -= mchunk; 834 } 835 out: 836 return result; 837 } 838 839 static int kimage_load_crash_segment(struct kimage *image, 840 struct kexec_segment *segment) 841 { 842 /* For crash dumps kernels we simply copy the data from 843 * user space to it's destination. 844 * We do things a page at a time for the sake of kmap. 845 */ 846 unsigned long maddr; 847 unsigned long ubytes, mbytes; 848 int result; 849 unsigned char __user *buf; 850 851 result = 0; 852 buf = segment->buf; 853 ubytes = segment->bufsz; 854 mbytes = segment->memsz; 855 maddr = segment->mem; 856 while (mbytes) { 857 struct page *page; 858 char *ptr; 859 size_t uchunk, mchunk; 860 861 page = pfn_to_page(maddr >> PAGE_SHIFT); 862 if (!page) { 863 result = -ENOMEM; 864 goto out; 865 } 866 ptr = kmap(page); 867 ptr += maddr & ~PAGE_MASK; 868 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 869 if (mchunk > mbytes) 870 mchunk = mbytes; 871 872 uchunk = mchunk; 873 if (uchunk > ubytes) { 874 uchunk = ubytes; 875 /* Zero the trailing part of the page */ 876 memset(ptr + uchunk, 0, mchunk - uchunk); 877 } 878 result = copy_from_user(ptr, buf, uchunk); 879 kexec_flush_icache_page(page); 880 kunmap(page); 881 if (result) { 882 result = (result < 0) ? result : -EIO; 883 goto out; 884 } 885 ubytes -= uchunk; 886 maddr += mchunk; 887 buf += mchunk; 888 mbytes -= mchunk; 889 } 890 out: 891 return result; 892 } 893 894 static int kimage_load_segment(struct kimage *image, 895 struct kexec_segment *segment) 896 { 897 int result = -ENOMEM; 898 899 switch (image->type) { 900 case KEXEC_TYPE_DEFAULT: 901 result = kimage_load_normal_segment(image, segment); 902 break; 903 case KEXEC_TYPE_CRASH: 904 result = kimage_load_crash_segment(image, segment); 905 break; 906 } 907 908 return result; 909 } 910 911 /* 912 * Exec Kernel system call: for obvious reasons only root may call it. 913 * 914 * This call breaks up into three pieces. 915 * - A generic part which loads the new kernel from the current 916 * address space, and very carefully places the data in the 917 * allocated pages. 918 * 919 * - A generic part that interacts with the kernel and tells all of 920 * the devices to shut down. Preventing on-going dmas, and placing 921 * the devices in a consistent state so a later kernel can 922 * reinitialize them. 923 * 924 * - A machine specific part that includes the syscall number 925 * and the copies the image to it's final destination. And 926 * jumps into the image at entry. 927 * 928 * kexec does not sync, or unmount filesystems so if you need 929 * that to happen you need to do that yourself. 930 */ 931 struct kimage *kexec_image; 932 struct kimage *kexec_crash_image; 933 934 static DEFINE_MUTEX(kexec_mutex); 935 936 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 937 struct kexec_segment __user *segments, 938 unsigned long flags) 939 { 940 struct kimage **dest_image, *image; 941 int result; 942 943 /* We only trust the superuser with rebooting the system. */ 944 if (!capable(CAP_SYS_BOOT)) 945 return -EPERM; 946 947 /* 948 * Verify we have a legal set of flags 949 * This leaves us room for future extensions. 950 */ 951 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 952 return -EINVAL; 953 954 /* Verify we are on the appropriate architecture */ 955 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && 956 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) 957 return -EINVAL; 958 959 /* Put an artificial cap on the number 960 * of segments passed to kexec_load. 961 */ 962 if (nr_segments > KEXEC_SEGMENT_MAX) 963 return -EINVAL; 964 965 image = NULL; 966 result = 0; 967 968 /* Because we write directly to the reserved memory 969 * region when loading crash kernels we need a mutex here to 970 * prevent multiple crash kernels from attempting to load 971 * simultaneously, and to prevent a crash kernel from loading 972 * over the top of a in use crash kernel. 973 * 974 * KISS: always take the mutex. 975 */ 976 if (!mutex_trylock(&kexec_mutex)) 977 return -EBUSY; 978 979 dest_image = &kexec_image; 980 if (flags & KEXEC_ON_CRASH) 981 dest_image = &kexec_crash_image; 982 if (nr_segments > 0) { 983 unsigned long i; 984 985 /* Loading another kernel to reboot into */ 986 if ((flags & KEXEC_ON_CRASH) == 0) 987 result = kimage_normal_alloc(&image, entry, 988 nr_segments, segments); 989 /* Loading another kernel to switch to if this one crashes */ 990 else if (flags & KEXEC_ON_CRASH) { 991 /* Free any current crash dump kernel before 992 * we corrupt it. 993 */ 994 kimage_free(xchg(&kexec_crash_image, NULL)); 995 result = kimage_crash_alloc(&image, entry, 996 nr_segments, segments); 997 } 998 if (result) 999 goto out; 1000 1001 if (flags & KEXEC_PRESERVE_CONTEXT) 1002 image->preserve_context = 1; 1003 result = machine_kexec_prepare(image); 1004 if (result) 1005 goto out; 1006 1007 for (i = 0; i < nr_segments; i++) { 1008 result = kimage_load_segment(image, &image->segment[i]); 1009 if (result) 1010 goto out; 1011 } 1012 kimage_terminate(image); 1013 } 1014 /* Install the new kernel, and Uninstall the old */ 1015 image = xchg(dest_image, image); 1016 1017 out: 1018 mutex_unlock(&kexec_mutex); 1019 kimage_free(image); 1020 1021 return result; 1022 } 1023 1024 #ifdef CONFIG_COMPAT 1025 asmlinkage long compat_sys_kexec_load(unsigned long entry, 1026 unsigned long nr_segments, 1027 struct compat_kexec_segment __user *segments, 1028 unsigned long flags) 1029 { 1030 struct compat_kexec_segment in; 1031 struct kexec_segment out, __user *ksegments; 1032 unsigned long i, result; 1033 1034 /* Don't allow clients that don't understand the native 1035 * architecture to do anything. 1036 */ 1037 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 1038 return -EINVAL; 1039 1040 if (nr_segments > KEXEC_SEGMENT_MAX) 1041 return -EINVAL; 1042 1043 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1044 for (i=0; i < nr_segments; i++) { 1045 result = copy_from_user(&in, &segments[i], sizeof(in)); 1046 if (result) 1047 return -EFAULT; 1048 1049 out.buf = compat_ptr(in.buf); 1050 out.bufsz = in.bufsz; 1051 out.mem = in.mem; 1052 out.memsz = in.memsz; 1053 1054 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 1055 if (result) 1056 return -EFAULT; 1057 } 1058 1059 return sys_kexec_load(entry, nr_segments, ksegments, flags); 1060 } 1061 #endif 1062 1063 void crash_kexec(struct pt_regs *regs) 1064 { 1065 /* Take the kexec_mutex here to prevent sys_kexec_load 1066 * running on one cpu from replacing the crash kernel 1067 * we are using after a panic on a different cpu. 1068 * 1069 * If the crash kernel was not located in a fixed area 1070 * of memory the xchg(&kexec_crash_image) would be 1071 * sufficient. But since I reuse the memory... 1072 */ 1073 if (mutex_trylock(&kexec_mutex)) { 1074 if (kexec_crash_image) { 1075 struct pt_regs fixed_regs; 1076 crash_setup_regs(&fixed_regs, regs); 1077 crash_save_vmcoreinfo(); 1078 machine_crash_shutdown(&fixed_regs); 1079 machine_kexec(kexec_crash_image); 1080 } 1081 mutex_unlock(&kexec_mutex); 1082 } 1083 } 1084 1085 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1086 size_t data_len) 1087 { 1088 struct elf_note note; 1089 1090 note.n_namesz = strlen(name) + 1; 1091 note.n_descsz = data_len; 1092 note.n_type = type; 1093 memcpy(buf, ¬e, sizeof(note)); 1094 buf += (sizeof(note) + 3)/4; 1095 memcpy(buf, name, note.n_namesz); 1096 buf += (note.n_namesz + 3)/4; 1097 memcpy(buf, data, note.n_descsz); 1098 buf += (note.n_descsz + 3)/4; 1099 1100 return buf; 1101 } 1102 1103 static void final_note(u32 *buf) 1104 { 1105 struct elf_note note; 1106 1107 note.n_namesz = 0; 1108 note.n_descsz = 0; 1109 note.n_type = 0; 1110 memcpy(buf, ¬e, sizeof(note)); 1111 } 1112 1113 void crash_save_cpu(struct pt_regs *regs, int cpu) 1114 { 1115 struct elf_prstatus prstatus; 1116 u32 *buf; 1117 1118 if ((cpu < 0) || (cpu >= NR_CPUS)) 1119 return; 1120 1121 /* Using ELF notes here is opportunistic. 1122 * I need a well defined structure format 1123 * for the data I pass, and I need tags 1124 * on the data to indicate what information I have 1125 * squirrelled away. ELF notes happen to provide 1126 * all of that, so there is no need to invent something new. 1127 */ 1128 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1129 if (!buf) 1130 return; 1131 memset(&prstatus, 0, sizeof(prstatus)); 1132 prstatus.pr_pid = current->pid; 1133 elf_core_copy_regs(&prstatus.pr_reg, regs); 1134 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1135 &prstatus, sizeof(prstatus)); 1136 final_note(buf); 1137 } 1138 1139 static int __init crash_notes_memory_init(void) 1140 { 1141 /* Allocate memory for saving cpu registers. */ 1142 crash_notes = alloc_percpu(note_buf_t); 1143 if (!crash_notes) { 1144 printk("Kexec: Memory allocation for saving cpu register" 1145 " states failed\n"); 1146 return -ENOMEM; 1147 } 1148 return 0; 1149 } 1150 module_init(crash_notes_memory_init) 1151 1152 1153 /* 1154 * parsing the "crashkernel" commandline 1155 * 1156 * this code is intended to be called from architecture specific code 1157 */ 1158 1159 1160 /* 1161 * This function parses command lines in the format 1162 * 1163 * crashkernel=ramsize-range:size[,...][@offset] 1164 * 1165 * The function returns 0 on success and -EINVAL on failure. 1166 */ 1167 static int __init parse_crashkernel_mem(char *cmdline, 1168 unsigned long long system_ram, 1169 unsigned long long *crash_size, 1170 unsigned long long *crash_base) 1171 { 1172 char *cur = cmdline, *tmp; 1173 1174 /* for each entry of the comma-separated list */ 1175 do { 1176 unsigned long long start, end = ULLONG_MAX, size; 1177 1178 /* get the start of the range */ 1179 start = memparse(cur, &tmp); 1180 if (cur == tmp) { 1181 pr_warning("crashkernel: Memory value expected\n"); 1182 return -EINVAL; 1183 } 1184 cur = tmp; 1185 if (*cur != '-') { 1186 pr_warning("crashkernel: '-' expected\n"); 1187 return -EINVAL; 1188 } 1189 cur++; 1190 1191 /* if no ':' is here, than we read the end */ 1192 if (*cur != ':') { 1193 end = memparse(cur, &tmp); 1194 if (cur == tmp) { 1195 pr_warning("crashkernel: Memory " 1196 "value expected\n"); 1197 return -EINVAL; 1198 } 1199 cur = tmp; 1200 if (end <= start) { 1201 pr_warning("crashkernel: end <= start\n"); 1202 return -EINVAL; 1203 } 1204 } 1205 1206 if (*cur != ':') { 1207 pr_warning("crashkernel: ':' expected\n"); 1208 return -EINVAL; 1209 } 1210 cur++; 1211 1212 size = memparse(cur, &tmp); 1213 if (cur == tmp) { 1214 pr_warning("Memory value expected\n"); 1215 return -EINVAL; 1216 } 1217 cur = tmp; 1218 if (size >= system_ram) { 1219 pr_warning("crashkernel: invalid size\n"); 1220 return -EINVAL; 1221 } 1222 1223 /* match ? */ 1224 if (system_ram >= start && system_ram < end) { 1225 *crash_size = size; 1226 break; 1227 } 1228 } while (*cur++ == ','); 1229 1230 if (*crash_size > 0) { 1231 while (*cur != ' ' && *cur != '@') 1232 cur++; 1233 if (*cur == '@') { 1234 cur++; 1235 *crash_base = memparse(cur, &tmp); 1236 if (cur == tmp) { 1237 pr_warning("Memory value expected " 1238 "after '@'\n"); 1239 return -EINVAL; 1240 } 1241 } 1242 } 1243 1244 return 0; 1245 } 1246 1247 /* 1248 * That function parses "simple" (old) crashkernel command lines like 1249 * 1250 * crashkernel=size[@offset] 1251 * 1252 * It returns 0 on success and -EINVAL on failure. 1253 */ 1254 static int __init parse_crashkernel_simple(char *cmdline, 1255 unsigned long long *crash_size, 1256 unsigned long long *crash_base) 1257 { 1258 char *cur = cmdline; 1259 1260 *crash_size = memparse(cmdline, &cur); 1261 if (cmdline == cur) { 1262 pr_warning("crashkernel: memory value expected\n"); 1263 return -EINVAL; 1264 } 1265 1266 if (*cur == '@') 1267 *crash_base = memparse(cur+1, &cur); 1268 1269 return 0; 1270 } 1271 1272 /* 1273 * That function is the entry point for command line parsing and should be 1274 * called from the arch-specific code. 1275 */ 1276 int __init parse_crashkernel(char *cmdline, 1277 unsigned long long system_ram, 1278 unsigned long long *crash_size, 1279 unsigned long long *crash_base) 1280 { 1281 char *p = cmdline, *ck_cmdline = NULL; 1282 char *first_colon, *first_space; 1283 1284 BUG_ON(!crash_size || !crash_base); 1285 *crash_size = 0; 1286 *crash_base = 0; 1287 1288 /* find crashkernel and use the last one if there are more */ 1289 p = strstr(p, "crashkernel="); 1290 while (p) { 1291 ck_cmdline = p; 1292 p = strstr(p+1, "crashkernel="); 1293 } 1294 1295 if (!ck_cmdline) 1296 return -EINVAL; 1297 1298 ck_cmdline += 12; /* strlen("crashkernel=") */ 1299 1300 /* 1301 * if the commandline contains a ':', then that's the extended 1302 * syntax -- if not, it must be the classic syntax 1303 */ 1304 first_colon = strchr(ck_cmdline, ':'); 1305 first_space = strchr(ck_cmdline, ' '); 1306 if (first_colon && (!first_space || first_colon < first_space)) 1307 return parse_crashkernel_mem(ck_cmdline, system_ram, 1308 crash_size, crash_base); 1309 else 1310 return parse_crashkernel_simple(ck_cmdline, crash_size, 1311 crash_base); 1312 1313 return 0; 1314 } 1315 1316 1317 1318 void crash_save_vmcoreinfo(void) 1319 { 1320 u32 *buf; 1321 1322 if (!vmcoreinfo_size) 1323 return; 1324 1325 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1326 1327 buf = (u32 *)vmcoreinfo_note; 1328 1329 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1330 vmcoreinfo_size); 1331 1332 final_note(buf); 1333 } 1334 1335 void vmcoreinfo_append_str(const char *fmt, ...) 1336 { 1337 va_list args; 1338 char buf[0x50]; 1339 int r; 1340 1341 va_start(args, fmt); 1342 r = vsnprintf(buf, sizeof(buf), fmt, args); 1343 va_end(args); 1344 1345 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1346 r = vmcoreinfo_max_size - vmcoreinfo_size; 1347 1348 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1349 1350 vmcoreinfo_size += r; 1351 } 1352 1353 /* 1354 * provide an empty default implementation here -- architecture 1355 * code may override this 1356 */ 1357 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1358 {} 1359 1360 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1361 { 1362 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1363 } 1364 1365 static int __init crash_save_vmcoreinfo_init(void) 1366 { 1367 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 1368 VMCOREINFO_PAGESIZE(PAGE_SIZE); 1369 1370 VMCOREINFO_SYMBOL(init_uts_ns); 1371 VMCOREINFO_SYMBOL(node_online_map); 1372 VMCOREINFO_SYMBOL(swapper_pg_dir); 1373 VMCOREINFO_SYMBOL(_stext); 1374 1375 #ifndef CONFIG_NEED_MULTIPLE_NODES 1376 VMCOREINFO_SYMBOL(mem_map); 1377 VMCOREINFO_SYMBOL(contig_page_data); 1378 #endif 1379 #ifdef CONFIG_SPARSEMEM 1380 VMCOREINFO_SYMBOL(mem_section); 1381 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1382 VMCOREINFO_STRUCT_SIZE(mem_section); 1383 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1384 #endif 1385 VMCOREINFO_STRUCT_SIZE(page); 1386 VMCOREINFO_STRUCT_SIZE(pglist_data); 1387 VMCOREINFO_STRUCT_SIZE(zone); 1388 VMCOREINFO_STRUCT_SIZE(free_area); 1389 VMCOREINFO_STRUCT_SIZE(list_head); 1390 VMCOREINFO_SIZE(nodemask_t); 1391 VMCOREINFO_OFFSET(page, flags); 1392 VMCOREINFO_OFFSET(page, _count); 1393 VMCOREINFO_OFFSET(page, mapping); 1394 VMCOREINFO_OFFSET(page, lru); 1395 VMCOREINFO_OFFSET(pglist_data, node_zones); 1396 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1397 #ifdef CONFIG_FLAT_NODE_MEM_MAP 1398 VMCOREINFO_OFFSET(pglist_data, node_mem_map); 1399 #endif 1400 VMCOREINFO_OFFSET(pglist_data, node_start_pfn); 1401 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); 1402 VMCOREINFO_OFFSET(pglist_data, node_id); 1403 VMCOREINFO_OFFSET(zone, free_area); 1404 VMCOREINFO_OFFSET(zone, vm_stat); 1405 VMCOREINFO_OFFSET(zone, spanned_pages); 1406 VMCOREINFO_OFFSET(free_area, free_list); 1407 VMCOREINFO_OFFSET(list_head, next); 1408 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1410 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1411 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1412 VMCOREINFO_NUMBER(PG_lru); 1413 VMCOREINFO_NUMBER(PG_private); 1414 VMCOREINFO_NUMBER(PG_swapcache); 1415 1416 arch_crash_save_vmcoreinfo(); 1417 1418 return 0; 1419 } 1420 1421 module_init(crash_save_vmcoreinfo_init) 1422 1423 /* 1424 * Move into place and start executing a preloaded standalone 1425 * executable. If nothing was preloaded return an error. 1426 */ 1427 int kernel_kexec(void) 1428 { 1429 int error = 0; 1430 1431 if (!mutex_trylock(&kexec_mutex)) 1432 return -EBUSY; 1433 if (!kexec_image) { 1434 error = -EINVAL; 1435 goto Unlock; 1436 } 1437 1438 #ifdef CONFIG_KEXEC_JUMP 1439 if (kexec_image->preserve_context) { 1440 mutex_lock(&pm_mutex); 1441 pm_prepare_console(); 1442 error = freeze_processes(); 1443 if (error) { 1444 error = -EBUSY; 1445 goto Restore_console; 1446 } 1447 suspend_console(); 1448 error = device_suspend(PMSG_FREEZE); 1449 if (error) 1450 goto Resume_console; 1451 error = disable_nonboot_cpus(); 1452 if (error) 1453 goto Resume_devices; 1454 device_pm_lock(); 1455 local_irq_disable(); 1456 /* At this point, device_suspend() has been called, 1457 * but *not* device_power_down(). We *must* 1458 * device_power_down() now. Otherwise, drivers for 1459 * some devices (e.g. interrupt controllers) become 1460 * desynchronized with the actual state of the 1461 * hardware at resume time, and evil weirdness ensues. 1462 */ 1463 error = device_power_down(PMSG_FREEZE); 1464 if (error) 1465 goto Enable_irqs; 1466 } else 1467 #endif 1468 { 1469 kernel_restart_prepare(NULL); 1470 printk(KERN_EMERG "Starting new kernel\n"); 1471 machine_shutdown(); 1472 } 1473 1474 machine_kexec(kexec_image); 1475 1476 #ifdef CONFIG_KEXEC_JUMP 1477 if (kexec_image->preserve_context) { 1478 device_power_up(PMSG_RESTORE); 1479 Enable_irqs: 1480 local_irq_enable(); 1481 device_pm_unlock(); 1482 enable_nonboot_cpus(); 1483 Resume_devices: 1484 device_resume(PMSG_RESTORE); 1485 Resume_console: 1486 resume_console(); 1487 thaw_processes(); 1488 Restore_console: 1489 pm_restore_console(); 1490 mutex_unlock(&pm_mutex); 1491 } 1492 #endif 1493 1494 Unlock: 1495 mutex_unlock(&kexec_mutex); 1496 return error; 1497 } 1498