1 /* 2 * kexec.c - kexec system call 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * 5 * This source code is licensed under the GNU General Public License, 6 * Version 2. See the file COPYING for more details. 7 */ 8 9 #include <linux/capability.h> 10 #include <linux/mm.h> 11 #include <linux/file.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/kexec.h> 15 #include <linux/mutex.h> 16 #include <linux/list.h> 17 #include <linux/highmem.h> 18 #include <linux/syscalls.h> 19 #include <linux/reboot.h> 20 #include <linux/ioport.h> 21 #include <linux/hardirq.h> 22 #include <linux/elf.h> 23 #include <linux/elfcore.h> 24 #include <linux/utsrelease.h> 25 #include <linux/utsname.h> 26 #include <linux/numa.h> 27 #include <linux/suspend.h> 28 #include <linux/device.h> 29 #include <linux/freezer.h> 30 #include <linux/pm.h> 31 #include <linux/cpu.h> 32 #include <linux/console.h> 33 #include <linux/vmalloc.h> 34 35 #include <asm/page.h> 36 #include <asm/uaccess.h> 37 #include <asm/io.h> 38 #include <asm/system.h> 39 #include <asm/sections.h> 40 41 /* Per cpu memory for storing cpu states in case of system crash. */ 42 note_buf_t* crash_notes; 43 44 /* vmcoreinfo stuff */ 45 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 46 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 47 size_t vmcoreinfo_size; 48 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 49 50 /* Location of the reserved area for the crash kernel */ 51 struct resource crashk_res = { 52 .name = "Crash kernel", 53 .start = 0, 54 .end = 0, 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 56 }; 57 58 int kexec_should_crash(struct task_struct *p) 59 { 60 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) 61 return 1; 62 return 0; 63 } 64 65 /* 66 * When kexec transitions to the new kernel there is a one-to-one 67 * mapping between physical and virtual addresses. On processors 68 * where you can disable the MMU this is trivial, and easy. 
 * For others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to
 * the new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
108 */ 109 #define KIMAGE_NO_DEST (-1UL) 110 111 static int kimage_is_destination_range(struct kimage *image, 112 unsigned long start, unsigned long end); 113 static struct page *kimage_alloc_page(struct kimage *image, 114 gfp_t gfp_mask, 115 unsigned long dest); 116 117 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 118 unsigned long nr_segments, 119 struct kexec_segment __user *segments) 120 { 121 size_t segment_bytes; 122 struct kimage *image; 123 unsigned long i; 124 int result; 125 126 /* Allocate a controlling structure */ 127 result = -ENOMEM; 128 image = kzalloc(sizeof(*image), GFP_KERNEL); 129 if (!image) 130 goto out; 131 132 image->head = 0; 133 image->entry = &image->head; 134 image->last_entry = &image->head; 135 image->control_page = ~0; /* By default this does not apply */ 136 image->start = entry; 137 image->type = KEXEC_TYPE_DEFAULT; 138 139 /* Initialize the list of control pages */ 140 INIT_LIST_HEAD(&image->control_pages); 141 142 /* Initialize the list of destination pages */ 143 INIT_LIST_HEAD(&image->dest_pages); 144 145 /* Initialize the list of unuseable pages */ 146 INIT_LIST_HEAD(&image->unuseable_pages); 147 148 /* Read in the segments */ 149 image->nr_segments = nr_segments; 150 segment_bytes = nr_segments * sizeof(*segments); 151 result = copy_from_user(image->segment, segments, segment_bytes); 152 if (result) 153 goto out; 154 155 /* 156 * Verify we have good destination addresses. The caller is 157 * responsible for making certain we don't attempt to load 158 * the new image into invalid or reserved areas of RAM. This 159 * just verifies it is an address we can use. 160 * 161 * Since the kernel does everything in page size chunks ensure 162 * the destination addreses are page aligned. Too many 163 * special cases crop of when we don't do this. The most 164 * insidious is getting overlapping destination addresses 165 * simply because addresses are changed to page size 166 * granularity. 
167 */ 168 result = -EADDRNOTAVAIL; 169 for (i = 0; i < nr_segments; i++) { 170 unsigned long mstart, mend; 171 172 mstart = image->segment[i].mem; 173 mend = mstart + image->segment[i].memsz; 174 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 175 goto out; 176 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 177 goto out; 178 } 179 180 /* Verify our destination addresses do not overlap. 181 * If we alloed overlapping destination addresses 182 * through very weird things can happen with no 183 * easy explanation as one segment stops on another. 184 */ 185 result = -EINVAL; 186 for (i = 0; i < nr_segments; i++) { 187 unsigned long mstart, mend; 188 unsigned long j; 189 190 mstart = image->segment[i].mem; 191 mend = mstart + image->segment[i].memsz; 192 for (j = 0; j < i; j++) { 193 unsigned long pstart, pend; 194 pstart = image->segment[j].mem; 195 pend = pstart + image->segment[j].memsz; 196 /* Do the segments overlap ? */ 197 if ((mend > pstart) && (mstart < pend)) 198 goto out; 199 } 200 } 201 202 /* Ensure our buffer sizes are strictly less than 203 * our memory sizes. This should always be the case, 204 * and it is easier to check up front than to be surprised 205 * later on. 
206 */ 207 result = -EINVAL; 208 for (i = 0; i < nr_segments; i++) { 209 if (image->segment[i].bufsz > image->segment[i].memsz) 210 goto out; 211 } 212 213 result = 0; 214 out: 215 if (result == 0) 216 *rimage = image; 217 else 218 kfree(image); 219 220 return result; 221 222 } 223 224 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 225 unsigned long nr_segments, 226 struct kexec_segment __user *segments) 227 { 228 int result; 229 struct kimage *image; 230 231 /* Allocate and initialize a controlling structure */ 232 image = NULL; 233 result = do_kimage_alloc(&image, entry, nr_segments, segments); 234 if (result) 235 goto out; 236 237 *rimage = image; 238 239 /* 240 * Find a location for the control code buffer, and add it 241 * the vector of segments so that it's pages will also be 242 * counted as destination pages. 243 */ 244 result = -ENOMEM; 245 image->control_code_page = kimage_alloc_control_pages(image, 246 get_order(KEXEC_CONTROL_PAGE_SIZE)); 247 if (!image->control_code_page) { 248 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 249 goto out; 250 } 251 252 image->swap_page = kimage_alloc_control_pages(image, 0); 253 if (!image->swap_page) { 254 printk(KERN_ERR "Could not allocate swap buffer\n"); 255 goto out; 256 } 257 258 result = 0; 259 out: 260 if (result == 0) 261 *rimage = image; 262 else 263 kfree(image); 264 265 return result; 266 } 267 268 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 269 unsigned long nr_segments, 270 struct kexec_segment __user *segments) 271 { 272 int result; 273 struct kimage *image; 274 unsigned long i; 275 276 image = NULL; 277 /* Verify we have a valid entry point */ 278 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 279 result = -EADDRNOTAVAIL; 280 goto out; 281 } 282 283 /* Allocate and initialize a controlling structure */ 284 result = do_kimage_alloc(&image, entry, nr_segments, segments); 285 if (result) 286 goto out; 287 288 /* Enable 
the special crash kernel control page 289 * allocation policy. 290 */ 291 image->control_page = crashk_res.start; 292 image->type = KEXEC_TYPE_CRASH; 293 294 /* 295 * Verify we have good destination addresses. Normally 296 * the caller is responsible for making certain we don't 297 * attempt to load the new image into invalid or reserved 298 * areas of RAM. But crash kernels are preloaded into a 299 * reserved area of ram. We must ensure the addresses 300 * are in the reserved area otherwise preloading the 301 * kernel could corrupt things. 302 */ 303 result = -EADDRNOTAVAIL; 304 for (i = 0; i < nr_segments; i++) { 305 unsigned long mstart, mend; 306 307 mstart = image->segment[i].mem; 308 mend = mstart + image->segment[i].memsz - 1; 309 /* Ensure we are within the crash kernel limits */ 310 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 311 goto out; 312 } 313 314 /* 315 * Find a location for the control code buffer, and add 316 * the vector of segments so that it's pages will also be 317 * counted as destination pages. 
318 */ 319 result = -ENOMEM; 320 image->control_code_page = kimage_alloc_control_pages(image, 321 get_order(KEXEC_CONTROL_PAGE_SIZE)); 322 if (!image->control_code_page) { 323 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 324 goto out; 325 } 326 327 result = 0; 328 out: 329 if (result == 0) 330 *rimage = image; 331 else 332 kfree(image); 333 334 return result; 335 } 336 337 static int kimage_is_destination_range(struct kimage *image, 338 unsigned long start, 339 unsigned long end) 340 { 341 unsigned long i; 342 343 for (i = 0; i < image->nr_segments; i++) { 344 unsigned long mstart, mend; 345 346 mstart = image->segment[i].mem; 347 mend = mstart + image->segment[i].memsz; 348 if ((end > mstart) && (start < mend)) 349 return 1; 350 } 351 352 return 0; 353 } 354 355 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 356 { 357 struct page *pages; 358 359 pages = alloc_pages(gfp_mask, order); 360 if (pages) { 361 unsigned int count, i; 362 pages->mapping = NULL; 363 set_page_private(pages, order); 364 count = 1 << order; 365 for (i = 0; i < count; i++) 366 SetPageReserved(pages + i); 367 } 368 369 return pages; 370 } 371 372 static void kimage_free_pages(struct page *page) 373 { 374 unsigned int order, count, i; 375 376 order = page_private(page); 377 count = 1 << order; 378 for (i = 0; i < count; i++) 379 ClearPageReserved(page + i); 380 __free_pages(page, order); 381 } 382 383 static void kimage_free_page_list(struct list_head *list) 384 { 385 struct list_head *pos, *next; 386 387 list_for_each_safe(pos, next, list) { 388 struct page *page; 389 390 page = list_entry(pos, struct page, lru); 391 list_del(&page->lru); 392 kimage_free_pages(page); 393 } 394 } 395 396 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 397 unsigned int order) 398 { 399 /* Control pages are special, they are the intermediaries 400 * that are needed while we copy the rest of the pages 401 * to their final resting place. 
As such they must 402 * not conflict with either the destination addresses 403 * or memory the kernel is already using. 404 * 405 * The only case where we really need more than one of 406 * these are for architectures where we cannot disable 407 * the MMU and must instead generate an identity mapped 408 * page table for all of the memory. 409 * 410 * At worst this runs in O(N) of the image size. 411 */ 412 struct list_head extra_pages; 413 struct page *pages; 414 unsigned int count; 415 416 count = 1 << order; 417 INIT_LIST_HEAD(&extra_pages); 418 419 /* Loop while I can allocate a page and the page allocated 420 * is a destination page. 421 */ 422 do { 423 unsigned long pfn, epfn, addr, eaddr; 424 425 pages = kimage_alloc_pages(GFP_KERNEL, order); 426 if (!pages) 427 break; 428 pfn = page_to_pfn(pages); 429 epfn = pfn + count; 430 addr = pfn << PAGE_SHIFT; 431 eaddr = epfn << PAGE_SHIFT; 432 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 433 kimage_is_destination_range(image, addr, eaddr)) { 434 list_add(&pages->lru, &extra_pages); 435 pages = NULL; 436 } 437 } while (!pages); 438 439 if (pages) { 440 /* Remember the allocated page... */ 441 list_add(&pages->lru, &image->control_pages); 442 443 /* Because the page is already in it's destination 444 * location we will never allocate another page at 445 * that address. Therefore kimage_alloc_pages 446 * will not return it (again) and we don't need 447 * to give it an entry in image->segment[]. 448 */ 449 } 450 /* Deal with the destination pages I have inadvertently allocated. 451 * 452 * Ideally I would convert multi-page allocations into single 453 * page allocations, and add everyting to image->dest_pages. 454 * 455 * For now it is simpler to just free the pages. 
456 */ 457 kimage_free_page_list(&extra_pages); 458 459 return pages; 460 } 461 462 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 463 unsigned int order) 464 { 465 /* Control pages are special, they are the intermediaries 466 * that are needed while we copy the rest of the pages 467 * to their final resting place. As such they must 468 * not conflict with either the destination addresses 469 * or memory the kernel is already using. 470 * 471 * Control pages are also the only pags we must allocate 472 * when loading a crash kernel. All of the other pages 473 * are specified by the segments and we just memcpy 474 * into them directly. 475 * 476 * The only case where we really need more than one of 477 * these are for architectures where we cannot disable 478 * the MMU and must instead generate an identity mapped 479 * page table for all of the memory. 480 * 481 * Given the low demand this implements a very simple 482 * allocator that finds the first hole of the appropriate 483 * size in the reserved memory region, and allocates all 484 * of the memory up to and including the hole. 
485 */ 486 unsigned long hole_start, hole_end, size; 487 struct page *pages; 488 489 pages = NULL; 490 size = (1 << order) << PAGE_SHIFT; 491 hole_start = (image->control_page + (size - 1)) & ~(size - 1); 492 hole_end = hole_start + size - 1; 493 while (hole_end <= crashk_res.end) { 494 unsigned long i; 495 496 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 497 break; 498 if (hole_end > crashk_res.end) 499 break; 500 /* See if I overlap any of the segments */ 501 for (i = 0; i < image->nr_segments; i++) { 502 unsigned long mstart, mend; 503 504 mstart = image->segment[i].mem; 505 mend = mstart + image->segment[i].memsz - 1; 506 if ((hole_end >= mstart) && (hole_start <= mend)) { 507 /* Advance the hole to the end of the segment */ 508 hole_start = (mend + (size - 1)) & ~(size - 1); 509 hole_end = hole_start + size - 1; 510 break; 511 } 512 } 513 /* If I don't overlap any segments I have found my hole! */ 514 if (i == image->nr_segments) { 515 pages = pfn_to_page(hole_start >> PAGE_SHIFT); 516 break; 517 } 518 } 519 if (pages) 520 image->control_page = hole_end; 521 522 return pages; 523 } 524 525 526 struct page *kimage_alloc_control_pages(struct kimage *image, 527 unsigned int order) 528 { 529 struct page *pages = NULL; 530 531 switch (image->type) { 532 case KEXEC_TYPE_DEFAULT: 533 pages = kimage_alloc_normal_control_pages(image, order); 534 break; 535 case KEXEC_TYPE_CRASH: 536 pages = kimage_alloc_crash_control_pages(image, order); 537 break; 538 } 539 540 return pages; 541 } 542 543 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 544 { 545 if (*image->entry != 0) 546 image->entry++; 547 548 if (image->entry == image->last_entry) { 549 kimage_entry_t *ind_page; 550 struct page *page; 551 552 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 553 if (!page) 554 return -ENOMEM; 555 556 ind_page = page_address(page); 557 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 558 image->entry = ind_page; 559 image->last_entry = 
ind_page + 560 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 561 } 562 *image->entry = entry; 563 image->entry++; 564 *image->entry = 0; 565 566 return 0; 567 } 568 569 static int kimage_set_destination(struct kimage *image, 570 unsigned long destination) 571 { 572 int result; 573 574 destination &= PAGE_MASK; 575 result = kimage_add_entry(image, destination | IND_DESTINATION); 576 if (result == 0) 577 image->destination = destination; 578 579 return result; 580 } 581 582 583 static int kimage_add_page(struct kimage *image, unsigned long page) 584 { 585 int result; 586 587 page &= PAGE_MASK; 588 result = kimage_add_entry(image, page | IND_SOURCE); 589 if (result == 0) 590 image->destination += PAGE_SIZE; 591 592 return result; 593 } 594 595 596 static void kimage_free_extra_pages(struct kimage *image) 597 { 598 /* Walk through and free any extra destination pages I may have */ 599 kimage_free_page_list(&image->dest_pages); 600 601 /* Walk through and free any unuseable pages I have cached */ 602 kimage_free_page_list(&image->unuseable_pages); 603 604 } 605 static void kimage_terminate(struct kimage *image) 606 { 607 if (*image->entry != 0) 608 image->entry++; 609 610 *image->entry = IND_DONE; 611 } 612 613 #define for_each_kimage_entry(image, ptr, entry) \ 614 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 615 ptr = (entry & IND_INDIRECTION)? 
\ 616 phys_to_virt((entry & PAGE_MASK)): ptr +1) 617 618 static void kimage_free_entry(kimage_entry_t entry) 619 { 620 struct page *page; 621 622 page = pfn_to_page(entry >> PAGE_SHIFT); 623 kimage_free_pages(page); 624 } 625 626 static void kimage_free(struct kimage *image) 627 { 628 kimage_entry_t *ptr, entry; 629 kimage_entry_t ind = 0; 630 631 if (!image) 632 return; 633 634 kimage_free_extra_pages(image); 635 for_each_kimage_entry(image, ptr, entry) { 636 if (entry & IND_INDIRECTION) { 637 /* Free the previous indirection page */ 638 if (ind & IND_INDIRECTION) 639 kimage_free_entry(ind); 640 /* Save this indirection page until we are 641 * done with it. 642 */ 643 ind = entry; 644 } 645 else if (entry & IND_SOURCE) 646 kimage_free_entry(entry); 647 } 648 /* Free the final indirection page */ 649 if (ind & IND_INDIRECTION) 650 kimage_free_entry(ind); 651 652 /* Handle any machine specific cleanup */ 653 machine_kexec_cleanup(image); 654 655 /* Free the kexec control pages... */ 656 kimage_free_page_list(&image->control_pages); 657 kfree(image); 658 } 659 660 static kimage_entry_t *kimage_dst_used(struct kimage *image, 661 unsigned long page) 662 { 663 kimage_entry_t *ptr, entry; 664 unsigned long destination = 0; 665 666 for_each_kimage_entry(image, ptr, entry) { 667 if (entry & IND_DESTINATION) 668 destination = entry & PAGE_MASK; 669 else if (entry & IND_SOURCE) { 670 if (page == destination) 671 return ptr; 672 destination += PAGE_SIZE; 673 } 674 } 675 676 return NULL; 677 } 678 679 static struct page *kimage_alloc_page(struct kimage *image, 680 gfp_t gfp_mask, 681 unsigned long destination) 682 { 683 /* 684 * Here we implement safeguards to ensure that a source page 685 * is not copied to its destination page before the data on 686 * the destination page is no longer useful. 687 * 688 * To do this we maintain the invariant that a source page is 689 * either its own destination page, or it is not a 690 * destination page at all. 
691 * 692 * That is slightly stronger than required, but the proof 693 * that no problems will not occur is trivial, and the 694 * implementation is simply to verify. 695 * 696 * When allocating all pages normally this algorithm will run 697 * in O(N) time, but in the worst case it will run in O(N^2) 698 * time. If the runtime is a problem the data structures can 699 * be fixed. 700 */ 701 struct page *page; 702 unsigned long addr; 703 704 /* 705 * Walk through the list of destination pages, and see if I 706 * have a match. 707 */ 708 list_for_each_entry(page, &image->dest_pages, lru) { 709 addr = page_to_pfn(page) << PAGE_SHIFT; 710 if (addr == destination) { 711 list_del(&page->lru); 712 return page; 713 } 714 } 715 page = NULL; 716 while (1) { 717 kimage_entry_t *old; 718 719 /* Allocate a page, if we run out of memory give up */ 720 page = kimage_alloc_pages(gfp_mask, 0); 721 if (!page) 722 return NULL; 723 /* If the page cannot be used file it away */ 724 if (page_to_pfn(page) > 725 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 726 list_add(&page->lru, &image->unuseable_pages); 727 continue; 728 } 729 addr = page_to_pfn(page) << PAGE_SHIFT; 730 731 /* If it is the destination page we want use it */ 732 if (addr == destination) 733 break; 734 735 /* If the page is not a destination page use it */ 736 if (!kimage_is_destination_range(image, addr, 737 addr + PAGE_SIZE)) 738 break; 739 740 /* 741 * I know that the page is someones destination page. 742 * See if there is already a source page for this 743 * destination page. And if so swap the source pages. 
744 */ 745 old = kimage_dst_used(image, addr); 746 if (old) { 747 /* If so move it */ 748 unsigned long old_addr; 749 struct page *old_page; 750 751 old_addr = *old & PAGE_MASK; 752 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 753 copy_highpage(page, old_page); 754 *old = addr | (*old & ~PAGE_MASK); 755 756 /* The old page I have found cannot be a 757 * destination page, so return it if it's 758 * gfp_flags honor the ones passed in. 759 */ 760 if (!(gfp_mask & __GFP_HIGHMEM) && 761 PageHighMem(old_page)) { 762 kimage_free_pages(old_page); 763 continue; 764 } 765 addr = old_addr; 766 page = old_page; 767 break; 768 } 769 else { 770 /* Place the page on the destination list I 771 * will use it later. 772 */ 773 list_add(&page->lru, &image->dest_pages); 774 } 775 } 776 777 return page; 778 } 779 780 static int kimage_load_normal_segment(struct kimage *image, 781 struct kexec_segment *segment) 782 { 783 unsigned long maddr; 784 unsigned long ubytes, mbytes; 785 int result; 786 unsigned char __user *buf; 787 788 result = 0; 789 buf = segment->buf; 790 ubytes = segment->bufsz; 791 mbytes = segment->memsz; 792 maddr = segment->mem; 793 794 result = kimage_set_destination(image, maddr); 795 if (result < 0) 796 goto out; 797 798 while (mbytes) { 799 struct page *page; 800 char *ptr; 801 size_t uchunk, mchunk; 802 803 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 804 if (!page) { 805 result = -ENOMEM; 806 goto out; 807 } 808 result = kimage_add_page(image, page_to_pfn(page) 809 << PAGE_SHIFT); 810 if (result < 0) 811 goto out; 812 813 ptr = kmap(page); 814 /* Start with a clear page */ 815 memset(ptr, 0, PAGE_SIZE); 816 ptr += maddr & ~PAGE_MASK; 817 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 818 if (mchunk > mbytes) 819 mchunk = mbytes; 820 821 uchunk = mchunk; 822 if (uchunk > ubytes) 823 uchunk = ubytes; 824 825 result = copy_from_user(ptr, buf, uchunk); 826 kunmap(page); 827 if (result) { 828 result = (result < 0) ? 
result : -EIO; 829 goto out; 830 } 831 ubytes -= uchunk; 832 maddr += mchunk; 833 buf += mchunk; 834 mbytes -= mchunk; 835 } 836 out: 837 return result; 838 } 839 840 static int kimage_load_crash_segment(struct kimage *image, 841 struct kexec_segment *segment) 842 { 843 /* For crash dumps kernels we simply copy the data from 844 * user space to it's destination. 845 * We do things a page at a time for the sake of kmap. 846 */ 847 unsigned long maddr; 848 unsigned long ubytes, mbytes; 849 int result; 850 unsigned char __user *buf; 851 852 result = 0; 853 buf = segment->buf; 854 ubytes = segment->bufsz; 855 mbytes = segment->memsz; 856 maddr = segment->mem; 857 while (mbytes) { 858 struct page *page; 859 char *ptr; 860 size_t uchunk, mchunk; 861 862 page = pfn_to_page(maddr >> PAGE_SHIFT); 863 if (!page) { 864 result = -ENOMEM; 865 goto out; 866 } 867 ptr = kmap(page); 868 ptr += maddr & ~PAGE_MASK; 869 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 870 if (mchunk > mbytes) 871 mchunk = mbytes; 872 873 uchunk = mchunk; 874 if (uchunk > ubytes) { 875 uchunk = ubytes; 876 /* Zero the trailing part of the page */ 877 memset(ptr + uchunk, 0, mchunk - uchunk); 878 } 879 result = copy_from_user(ptr, buf, uchunk); 880 kexec_flush_icache_page(page); 881 kunmap(page); 882 if (result) { 883 result = (result < 0) ? result : -EIO; 884 goto out; 885 } 886 ubytes -= uchunk; 887 maddr += mchunk; 888 buf += mchunk; 889 mbytes -= mchunk; 890 } 891 out: 892 return result; 893 } 894 895 static int kimage_load_segment(struct kimage *image, 896 struct kexec_segment *segment) 897 { 898 int result = -ENOMEM; 899 900 switch (image->type) { 901 case KEXEC_TYPE_DEFAULT: 902 result = kimage_load_normal_segment(image, segment); 903 break; 904 case KEXEC_TYPE_CRASH: 905 result = kimage_load_crash_segment(image, segment); 906 break; 907 } 908 909 return result; 910 } 911 912 /* 913 * Exec Kernel system call: for obvious reasons only root may call it. 914 * 915 * This call breaks up into three pieces. 
916 * - A generic part which loads the new kernel from the current 917 * address space, and very carefully places the data in the 918 * allocated pages. 919 * 920 * - A generic part that interacts with the kernel and tells all of 921 * the devices to shut down. Preventing on-going dmas, and placing 922 * the devices in a consistent state so a later kernel can 923 * reinitialize them. 924 * 925 * - A machine specific part that includes the syscall number 926 * and the copies the image to it's final destination. And 927 * jumps into the image at entry. 928 * 929 * kexec does not sync, or unmount filesystems so if you need 930 * that to happen you need to do that yourself. 931 */ 932 struct kimage *kexec_image; 933 struct kimage *kexec_crash_image; 934 935 static DEFINE_MUTEX(kexec_mutex); 936 937 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, 938 struct kexec_segment __user *, segments, unsigned long, flags) 939 { 940 struct kimage **dest_image, *image; 941 int result; 942 943 /* We only trust the superuser with rebooting the system. */ 944 if (!capable(CAP_SYS_BOOT)) 945 return -EPERM; 946 947 /* 948 * Verify we have a legal set of flags 949 * This leaves us room for future extensions. 950 */ 951 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 952 return -EINVAL; 953 954 /* Verify we are on the appropriate architecture */ 955 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && 956 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) 957 return -EINVAL; 958 959 /* Put an artificial cap on the number 960 * of segments passed to kexec_load. 961 */ 962 if (nr_segments > KEXEC_SEGMENT_MAX) 963 return -EINVAL; 964 965 image = NULL; 966 result = 0; 967 968 /* Because we write directly to the reserved memory 969 * region when loading crash kernels we need a mutex here to 970 * prevent multiple crash kernels from attempting to load 971 * simultaneously, and to prevent a crash kernel from loading 972 * over the top of a in use crash kernel. 
973 * 974 * KISS: always take the mutex. 975 */ 976 if (!mutex_trylock(&kexec_mutex)) 977 return -EBUSY; 978 979 dest_image = &kexec_image; 980 if (flags & KEXEC_ON_CRASH) 981 dest_image = &kexec_crash_image; 982 if (nr_segments > 0) { 983 unsigned long i; 984 985 /* Loading another kernel to reboot into */ 986 if ((flags & KEXEC_ON_CRASH) == 0) 987 result = kimage_normal_alloc(&image, entry, 988 nr_segments, segments); 989 /* Loading another kernel to switch to if this one crashes */ 990 else if (flags & KEXEC_ON_CRASH) { 991 /* Free any current crash dump kernel before 992 * we corrupt it. 993 */ 994 kimage_free(xchg(&kexec_crash_image, NULL)); 995 result = kimage_crash_alloc(&image, entry, 996 nr_segments, segments); 997 } 998 if (result) 999 goto out; 1000 1001 if (flags & KEXEC_PRESERVE_CONTEXT) 1002 image->preserve_context = 1; 1003 result = machine_kexec_prepare(image); 1004 if (result) 1005 goto out; 1006 1007 for (i = 0; i < nr_segments; i++) { 1008 result = kimage_load_segment(image, &image->segment[i]); 1009 if (result) 1010 goto out; 1011 } 1012 kimage_terminate(image); 1013 } 1014 /* Install the new kernel, and Uninstall the old */ 1015 image = xchg(dest_image, image); 1016 1017 out: 1018 mutex_unlock(&kexec_mutex); 1019 kimage_free(image); 1020 1021 return result; 1022 } 1023 1024 #ifdef CONFIG_COMPAT 1025 asmlinkage long compat_sys_kexec_load(unsigned long entry, 1026 unsigned long nr_segments, 1027 struct compat_kexec_segment __user *segments, 1028 unsigned long flags) 1029 { 1030 struct compat_kexec_segment in; 1031 struct kexec_segment out, __user *ksegments; 1032 unsigned long i, result; 1033 1034 /* Don't allow clients that don't understand the native 1035 * architecture to do anything. 
1036 */ 1037 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 1038 return -EINVAL; 1039 1040 if (nr_segments > KEXEC_SEGMENT_MAX) 1041 return -EINVAL; 1042 1043 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1044 for (i=0; i < nr_segments; i++) { 1045 result = copy_from_user(&in, &segments[i], sizeof(in)); 1046 if (result) 1047 return -EFAULT; 1048 1049 out.buf = compat_ptr(in.buf); 1050 out.bufsz = in.bufsz; 1051 out.mem = in.mem; 1052 out.memsz = in.memsz; 1053 1054 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 1055 if (result) 1056 return -EFAULT; 1057 } 1058 1059 return sys_kexec_load(entry, nr_segments, ksegments, flags); 1060 } 1061 #endif 1062 1063 void crash_kexec(struct pt_regs *regs) 1064 { 1065 /* Take the kexec_mutex here to prevent sys_kexec_load 1066 * running on one cpu from replacing the crash kernel 1067 * we are using after a panic on a different cpu. 1068 * 1069 * If the crash kernel was not located in a fixed area 1070 * of memory the xchg(&kexec_crash_image) would be 1071 * sufficient. But since I reuse the memory... 
1072 */ 1073 if (mutex_trylock(&kexec_mutex)) { 1074 if (kexec_crash_image) { 1075 struct pt_regs fixed_regs; 1076 crash_setup_regs(&fixed_regs, regs); 1077 crash_save_vmcoreinfo(); 1078 machine_crash_shutdown(&fixed_regs); 1079 machine_kexec(kexec_crash_image); 1080 } 1081 mutex_unlock(&kexec_mutex); 1082 } 1083 } 1084 1085 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1086 size_t data_len) 1087 { 1088 struct elf_note note; 1089 1090 note.n_namesz = strlen(name) + 1; 1091 note.n_descsz = data_len; 1092 note.n_type = type; 1093 memcpy(buf, ¬e, sizeof(note)); 1094 buf += (sizeof(note) + 3)/4; 1095 memcpy(buf, name, note.n_namesz); 1096 buf += (note.n_namesz + 3)/4; 1097 memcpy(buf, data, note.n_descsz); 1098 buf += (note.n_descsz + 3)/4; 1099 1100 return buf; 1101 } 1102 1103 static void final_note(u32 *buf) 1104 { 1105 struct elf_note note; 1106 1107 note.n_namesz = 0; 1108 note.n_descsz = 0; 1109 note.n_type = 0; 1110 memcpy(buf, ¬e, sizeof(note)); 1111 } 1112 1113 void crash_save_cpu(struct pt_regs *regs, int cpu) 1114 { 1115 struct elf_prstatus prstatus; 1116 u32 *buf; 1117 1118 if ((cpu < 0) || (cpu >= nr_cpu_ids)) 1119 return; 1120 1121 /* Using ELF notes here is opportunistic. 1122 * I need a well defined structure format 1123 * for the data I pass, and I need tags 1124 * on the data to indicate what information I have 1125 * squirrelled away. ELF notes happen to provide 1126 * all of that, so there is no need to invent something new. 1127 */ 1128 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1129 if (!buf) 1130 return; 1131 memset(&prstatus, 0, sizeof(prstatus)); 1132 prstatus.pr_pid = current->pid; 1133 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); 1134 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1135 &prstatus, sizeof(prstatus)); 1136 final_note(buf); 1137 } 1138 1139 static int __init crash_notes_memory_init(void) 1140 { 1141 /* Allocate memory for saving cpu registers. 
*/ 1142 crash_notes = alloc_percpu(note_buf_t); 1143 if (!crash_notes) { 1144 printk("Kexec: Memory allocation for saving cpu register" 1145 " states failed\n"); 1146 return -ENOMEM; 1147 } 1148 return 0; 1149 } 1150 module_init(crash_notes_memory_init) 1151 1152 1153 /* 1154 * parsing the "crashkernel" commandline 1155 * 1156 * this code is intended to be called from architecture specific code 1157 */ 1158 1159 1160 /* 1161 * This function parses command lines in the format 1162 * 1163 * crashkernel=ramsize-range:size[,...][@offset] 1164 * 1165 * The function returns 0 on success and -EINVAL on failure. 1166 */ 1167 static int __init parse_crashkernel_mem(char *cmdline, 1168 unsigned long long system_ram, 1169 unsigned long long *crash_size, 1170 unsigned long long *crash_base) 1171 { 1172 char *cur = cmdline, *tmp; 1173 1174 /* for each entry of the comma-separated list */ 1175 do { 1176 unsigned long long start, end = ULLONG_MAX, size; 1177 1178 /* get the start of the range */ 1179 start = memparse(cur, &tmp); 1180 if (cur == tmp) { 1181 pr_warning("crashkernel: Memory value expected\n"); 1182 return -EINVAL; 1183 } 1184 cur = tmp; 1185 if (*cur != '-') { 1186 pr_warning("crashkernel: '-' expected\n"); 1187 return -EINVAL; 1188 } 1189 cur++; 1190 1191 /* if no ':' is here, than we read the end */ 1192 if (*cur != ':') { 1193 end = memparse(cur, &tmp); 1194 if (cur == tmp) { 1195 pr_warning("crashkernel: Memory " 1196 "value expected\n"); 1197 return -EINVAL; 1198 } 1199 cur = tmp; 1200 if (end <= start) { 1201 pr_warning("crashkernel: end <= start\n"); 1202 return -EINVAL; 1203 } 1204 } 1205 1206 if (*cur != ':') { 1207 pr_warning("crashkernel: ':' expected\n"); 1208 return -EINVAL; 1209 } 1210 cur++; 1211 1212 size = memparse(cur, &tmp); 1213 if (cur == tmp) { 1214 pr_warning("Memory value expected\n"); 1215 return -EINVAL; 1216 } 1217 cur = tmp; 1218 if (size >= system_ram) { 1219 pr_warning("crashkernel: invalid size\n"); 1220 return -EINVAL; 1221 } 1222 
1223 /* match ? */ 1224 if (system_ram >= start && system_ram < end) { 1225 *crash_size = size; 1226 break; 1227 } 1228 } while (*cur++ == ','); 1229 1230 if (*crash_size > 0) { 1231 while (*cur != ' ' && *cur != '@') 1232 cur++; 1233 if (*cur == '@') { 1234 cur++; 1235 *crash_base = memparse(cur, &tmp); 1236 if (cur == tmp) { 1237 pr_warning("Memory value expected " 1238 "after '@'\n"); 1239 return -EINVAL; 1240 } 1241 } 1242 } 1243 1244 return 0; 1245 } 1246 1247 /* 1248 * That function parses "simple" (old) crashkernel command lines like 1249 * 1250 * crashkernel=size[@offset] 1251 * 1252 * It returns 0 on success and -EINVAL on failure. 1253 */ 1254 static int __init parse_crashkernel_simple(char *cmdline, 1255 unsigned long long *crash_size, 1256 unsigned long long *crash_base) 1257 { 1258 char *cur = cmdline; 1259 1260 *crash_size = memparse(cmdline, &cur); 1261 if (cmdline == cur) { 1262 pr_warning("crashkernel: memory value expected\n"); 1263 return -EINVAL; 1264 } 1265 1266 if (*cur == '@') 1267 *crash_base = memparse(cur+1, &cur); 1268 1269 return 0; 1270 } 1271 1272 /* 1273 * That function is the entry point for command line parsing and should be 1274 * called from the arch-specific code. 
1275 */ 1276 int __init parse_crashkernel(char *cmdline, 1277 unsigned long long system_ram, 1278 unsigned long long *crash_size, 1279 unsigned long long *crash_base) 1280 { 1281 char *p = cmdline, *ck_cmdline = NULL; 1282 char *first_colon, *first_space; 1283 1284 BUG_ON(!crash_size || !crash_base); 1285 *crash_size = 0; 1286 *crash_base = 0; 1287 1288 /* find crashkernel and use the last one if there are more */ 1289 p = strstr(p, "crashkernel="); 1290 while (p) { 1291 ck_cmdline = p; 1292 p = strstr(p+1, "crashkernel="); 1293 } 1294 1295 if (!ck_cmdline) 1296 return -EINVAL; 1297 1298 ck_cmdline += 12; /* strlen("crashkernel=") */ 1299 1300 /* 1301 * if the commandline contains a ':', then that's the extended 1302 * syntax -- if not, it must be the classic syntax 1303 */ 1304 first_colon = strchr(ck_cmdline, ':'); 1305 first_space = strchr(ck_cmdline, ' '); 1306 if (first_colon && (!first_space || first_colon < first_space)) 1307 return parse_crashkernel_mem(ck_cmdline, system_ram, 1308 crash_size, crash_base); 1309 else 1310 return parse_crashkernel_simple(ck_cmdline, crash_size, 1311 crash_base); 1312 1313 return 0; 1314 } 1315 1316 1317 1318 void crash_save_vmcoreinfo(void) 1319 { 1320 u32 *buf; 1321 1322 if (!vmcoreinfo_size) 1323 return; 1324 1325 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1326 1327 buf = (u32 *)vmcoreinfo_note; 1328 1329 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1330 vmcoreinfo_size); 1331 1332 final_note(buf); 1333 } 1334 1335 void vmcoreinfo_append_str(const char *fmt, ...) 
1336 { 1337 va_list args; 1338 char buf[0x50]; 1339 int r; 1340 1341 va_start(args, fmt); 1342 r = vsnprintf(buf, sizeof(buf), fmt, args); 1343 va_end(args); 1344 1345 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1346 r = vmcoreinfo_max_size - vmcoreinfo_size; 1347 1348 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1349 1350 vmcoreinfo_size += r; 1351 } 1352 1353 /* 1354 * provide an empty default implementation here -- architecture 1355 * code may override this 1356 */ 1357 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1358 {} 1359 1360 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1361 { 1362 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1363 } 1364 1365 static int __init crash_save_vmcoreinfo_init(void) 1366 { 1367 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 1368 VMCOREINFO_PAGESIZE(PAGE_SIZE); 1369 1370 VMCOREINFO_SYMBOL(init_uts_ns); 1371 VMCOREINFO_SYMBOL(node_online_map); 1372 VMCOREINFO_SYMBOL(swapper_pg_dir); 1373 VMCOREINFO_SYMBOL(_stext); 1374 VMCOREINFO_SYMBOL(vmlist); 1375 1376 #ifndef CONFIG_NEED_MULTIPLE_NODES 1377 VMCOREINFO_SYMBOL(mem_map); 1378 VMCOREINFO_SYMBOL(contig_page_data); 1379 #endif 1380 #ifdef CONFIG_SPARSEMEM 1381 VMCOREINFO_SYMBOL(mem_section); 1382 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1383 VMCOREINFO_STRUCT_SIZE(mem_section); 1384 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1385 #endif 1386 VMCOREINFO_STRUCT_SIZE(page); 1387 VMCOREINFO_STRUCT_SIZE(pglist_data); 1388 VMCOREINFO_STRUCT_SIZE(zone); 1389 VMCOREINFO_STRUCT_SIZE(free_area); 1390 VMCOREINFO_STRUCT_SIZE(list_head); 1391 VMCOREINFO_SIZE(nodemask_t); 1392 VMCOREINFO_OFFSET(page, flags); 1393 VMCOREINFO_OFFSET(page, _count); 1394 VMCOREINFO_OFFSET(page, mapping); 1395 VMCOREINFO_OFFSET(page, lru); 1396 VMCOREINFO_OFFSET(pglist_data, node_zones); 1397 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1398 #ifdef CONFIG_FLAT_NODE_MEM_MAP 1399 VMCOREINFO_OFFSET(pglist_data, node_mem_map); 1400 #endif 1401 
VMCOREINFO_OFFSET(pglist_data, node_start_pfn); 1402 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); 1403 VMCOREINFO_OFFSET(pglist_data, node_id); 1404 VMCOREINFO_OFFSET(zone, free_area); 1405 VMCOREINFO_OFFSET(zone, vm_stat); 1406 VMCOREINFO_OFFSET(zone, spanned_pages); 1407 VMCOREINFO_OFFSET(free_area, free_list); 1408 VMCOREINFO_OFFSET(list_head, next); 1409 VMCOREINFO_OFFSET(list_head, prev); 1410 VMCOREINFO_OFFSET(vm_struct, addr); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1412 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(PG_lru); 1415 VMCOREINFO_NUMBER(PG_private); 1416 VMCOREINFO_NUMBER(PG_swapcache); 1417 1418 arch_crash_save_vmcoreinfo(); 1419 1420 return 0; 1421 } 1422 1423 module_init(crash_save_vmcoreinfo_init) 1424 1425 /* 1426 * Move into place and start executing a preloaded standalone 1427 * executable. If nothing was preloaded return an error. 1428 */ 1429 int kernel_kexec(void) 1430 { 1431 int error = 0; 1432 1433 if (!mutex_trylock(&kexec_mutex)) 1434 return -EBUSY; 1435 if (!kexec_image) { 1436 error = -EINVAL; 1437 goto Unlock; 1438 } 1439 1440 #ifdef CONFIG_KEXEC_JUMP 1441 if (kexec_image->preserve_context) { 1442 mutex_lock(&pm_mutex); 1443 pm_prepare_console(); 1444 error = freeze_processes(); 1445 if (error) { 1446 error = -EBUSY; 1447 goto Restore_console; 1448 } 1449 suspend_console(); 1450 error = device_suspend(PMSG_FREEZE); 1451 if (error) 1452 goto Resume_console; 1453 device_pm_lock(); 1454 /* At this point, device_suspend() has been called, 1455 * but *not* device_power_down(). We *must* 1456 * device_power_down() now. Otherwise, drivers for 1457 * some devices (e.g. interrupt controllers) become 1458 * desynchronized with the actual state of the 1459 * hardware at resume time, and evil weirdness ensues. 
1460 */ 1461 error = device_power_down(PMSG_FREEZE); 1462 if (error) 1463 goto Resume_devices; 1464 error = disable_nonboot_cpus(); 1465 if (error) 1466 goto Enable_cpus; 1467 local_irq_disable(); 1468 /* Suspend system devices */ 1469 error = sysdev_suspend(PMSG_FREEZE); 1470 if (error) 1471 goto Enable_irqs; 1472 } else 1473 #endif 1474 { 1475 kernel_restart_prepare(NULL); 1476 printk(KERN_EMERG "Starting new kernel\n"); 1477 machine_shutdown(); 1478 } 1479 1480 machine_kexec(kexec_image); 1481 1482 #ifdef CONFIG_KEXEC_JUMP 1483 if (kexec_image->preserve_context) { 1484 sysdev_resume(); 1485 Enable_irqs: 1486 local_irq_enable(); 1487 Enable_cpus: 1488 enable_nonboot_cpus(); 1489 device_power_up(PMSG_RESTORE); 1490 Resume_devices: 1491 device_pm_unlock(); 1492 device_resume(PMSG_RESTORE); 1493 Resume_console: 1494 resume_console(); 1495 thaw_processes(); 1496 Restore_console: 1497 pm_restore_console(); 1498 mutex_unlock(&pm_mutex); 1499 } 1500 #endif 1501 1502 Unlock: 1503 mutex_unlock(&kexec_mutex); 1504 return error; 1505 } 1506