// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * This file contains the setup_arch() code, which handles the architecture-dependent
 * parts of early kernel initialization.
 */
#include <linux/acpi.h>
#include <linux/console.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
#include <linux/memblock.h>
#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/hugetlb.h>
#include <linux/tboot.h>
#include <linux/usb/xhci-dbgp.h>
#include <linux/static_call.h>
#include <linux/swiotlb.h>
#include <linux/random.h>

#include <uapi/linux/mount.h>

#include <xen/xen.h>

#include <asm/apic.h>
#include <asm/efi.h>
#include <asm/numa.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
#include <asm/olpc_ofw.h>
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>

/*
 * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
 * max_pfn_mapped:     highest directly mapped pfn > 4 GB
 *
 * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
 * represented by pfn_mapped[].
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
#endif


unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;

struct boot_params boot_params;

/*
 * These are the four main kernel memory regions, we put them into
 * the resource tree so that kdump tools and other debugging tools
 * can recover them:
 */

static struct resource rodata_resource = {
	.name = "Kernel rodata",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static struct resource data_resource = {
	.name = "Kernel data",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static struct resource code_resource = {
	.name = "Kernel code",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static struct resource bss_resource = {
	.name = "Kernel bss",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};


#ifdef CONFIG_X86_32
/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;

struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);

#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
struct ist_info ist_info;
EXPORT_SYMBOL(ist_info);
#else
struct ist_info ist_info;
#endif

#endif

struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);

#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
#else
__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif

#ifdef CONFIG_IMA
static phys_addr_t ima_kexec_buffer_phys;
static size_t ima_kexec_buffer_size;
#endif

/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;

/*
 * Setup options
 */
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);

extern int root_mountflags;

unsigned long saved_video_mode;

#define RAMDISK_IMAGE_START_MASK	0x07FF
#define RAMDISK_PROMPT_FLAG		0x8000
#define RAMDISK_LOAD_FLAG		0x4000

static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
#endif

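/*
 * BIOS Enhanced Disk Drive (EDD) information, filled in by copy_edd() below
 * from the data that the real-mode boot code collected into boot_params.
 */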
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Copy the BIOS EDD information
 *              from boot_params into a safe place.
 *
 */
static inline void __init copy_edd(void)
{
	memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
	       sizeof(edd.mbr_signature));
	memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
	edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
	edd.edd_info_nr = boot_params.eddbuf_entries;
}
#else
static inline void __init copy_edd(void)
{
}
#endif

void * __init extend_brk(size_t size, size_t align)
{
	size_t mask = align - 1;
	void *ret;

	BUG_ON(_brk_start == 0);
	BUG_ON(align & mask);

	_brk_end = (_brk_end + mask) & ~mask;
	BUG_ON((char *)(_brk_end + size) > __brk_limit);

	ret = (void *)_brk_end;
	_brk_end += size;

	memset(ret, 0, size);

	return ret;
}

#ifdef CONFIG_X86_32
static void __init cleanup_highmap(void)
{
}
#endif

static void __init reserve_brk(void)
{
	if (_brk_end > _brk_start)
		memblock_reserve(__pa_symbol(_brk_start),
				 _brk_end - _brk_start);

	/*
	 * Mark brk area as locked down and no longer taking any
	 * new allocations.
	 */
	_brk_start = 0;
}

#ifdef CONFIG_BLK_DEV_INITRD

static u64 __init get_ramdisk_image(void)
{
	u64 ramdisk_image = boot_params.hdr.ramdisk_image;

	ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;

	if (ramdisk_image == 0)
		ramdisk_image = phys_initrd_start;

	return ramdisk_image;
}

static u64 __init get_ramdisk_size(void)
{
	u64 ramdisk_size = boot_params.hdr.ramdisk_size;

	ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;

	if (ramdisk_size == 0)
		ramdisk_size = phys_initrd_size;

	return ramdisk_size;
}

static void __init relocate_initrd(void)
{
	/* Assume only end is not page aligned */
	u64 ramdisk_image = get_ramdisk_image();
	u64 ramdisk_size = get_ramdisk_size();
	u64 area_size = PAGE_ALIGN(ramdisk_size);

	/* We need to move the initrd down into directly mapped mem */
	u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
							  PFN_PHYS(max_pfn_mapped));
	if (!relocated_ramdisk)
		panic("Cannot find place for new RAMDISK of size %lld\n",
		      ramdisk_size);

	initrd_start = relocated_ramdisk + PAGE_OFFSET;
	initrd_end = initrd_start + ramdisk_size;
	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);

	copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);

	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
	       " [mem %#010llx-%#010llx]\n",
	       ramdisk_image, ramdisk_image + ramdisk_size - 1,
	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}

static void __init early_reserve_initrd(void)
{
	/* Assume only end is not page aligned */
	u64 ramdisk_image = get_ramdisk_image();
	u64 ramdisk_size = get_ramdisk_size();
	u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);

	if (!boot_params.hdr.type_of_loader ||
	    !ramdisk_image || !ramdisk_size)
		return;		/* No initrd provided by bootloader */

	memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}

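/*
 * Make the initrd usable: if it already lies within the direct mapping, just
 * record initrd_start/initrd_end; otherwise relocate it into directly mapped
 * memory and free the original range.
 */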
static void __init reserve_initrd(void)
{
	/* Assume only end is not page aligned */
	u64 ramdisk_image = get_ramdisk_image();
	u64 ramdisk_size = get_ramdisk_size();
	u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);

	if (!boot_params.hdr.type_of_loader ||
	    !ramdisk_image || !ramdisk_size)
		return;		/* No initrd provided by bootloader */

	initrd_start = 0;

	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
	       ramdisk_end - 1);

	if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
				PFN_DOWN(ramdisk_end))) {
		/* All are mapped, easy case */
		initrd_start = ramdisk_image + PAGE_OFFSET;
		initrd_end = initrd_start + ramdisk_size;
		return;
	}

	relocate_initrd();

	memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}

#else
static void __init early_reserve_initrd(void)
{
}
static void __init reserve_initrd(void)
{
}
#endif /* CONFIG_BLK_DEV_INITRD */

static void __init add_early_ima_buffer(u64 phys_addr)
{
#ifdef CONFIG_IMA
	struct ima_setup_data *data;

	data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data));
	if (!data) {
		pr_warn("setup: failed to memremap ima_setup_data entry\n");
		return;
	}

	if (data->size) {
		memblock_reserve(data->addr, data->size);
		ima_kexec_buffer_phys = data->addr;
		ima_kexec_buffer_size = data->size;
	}

	early_memunmap(data, sizeof(*data));
#else
	pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
#endif
}

#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
int __init ima_free_kexec_buffer(void)
{
	if (!ima_kexec_buffer_size)
		return -ENOENT;

	memblock_free_late(ima_kexec_buffer_phys,
			   ima_kexec_buffer_size);

	ima_kexec_buffer_phys = 0;
	ima_kexec_buffer_size = 0;

	return 0;
}

int __init ima_get_kexec_buffer(void **addr, size_t *size)
{
	if (!ima_kexec_buffer_size)
		return -ENOENT;

	*addr = __va(ima_kexec_buffer_phys);
	*size = ima_kexec_buffer_size;

	return 0;
}
#endif

static void __init parse_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data, pa_next;

	pa_data = boot_params.hdr.setup_data;
	while (pa_data) {
		u32 data_len, data_type;

		data = early_memremap(pa_data, sizeof(*data));
		data_len = data->len + sizeof(struct setup_data);
		data_type = data->type;
		pa_next = data->next;
		early_memunmap(data, sizeof(*data));

		switch (data_type) {
		case SETUP_E820_EXT:
			e820__memory_setup_extended(pa_data, data_len);
			break;
		case SETUP_DTB:
			add_dtb(pa_data);
			break;
		case SETUP_EFI:
			parse_efi_setup(pa_data, data_len);
			break;
		case SETUP_IMA:
			add_early_ima_buffer(pa_data);
			break;
		case SETUP_RNG_SEED:
			data = early_memremap(pa_data, data_len);
			add_bootloader_randomness(data->data, data->len);
			/* Zero seed for forward secrecy. */
			memzero_explicit(data->data, data->len);
			/* Zero length in case we find ourselves back here by accident. */
			memzero_explicit(&data->len, sizeof(data->len));
			early_memunmap(data, data_len);
			break;
		default:
			break;
		}
		pa_data = pa_next;
	}
}

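/*
 * Reserve the memory occupied by the setup_data entries themselves (and, for
 * SETUP_INDIRECT entries, the data they point to) so that early memblock
 * allocations cannot overwrite them.
 */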
static void __init memblock_x86_reserve_range_setup_data(void)
{
	struct setup_indirect *indirect;
	struct setup_data *data;
	u64 pa_data, pa_next;
	u32 len;

	pa_data = boot_params.hdr.setup_data;
	while (pa_data) {
		data = early_memremap(pa_data, sizeof(*data));
		if (!data) {
			pr_warn("setup: failed to memremap setup_data entry\n");
			return;
		}

		len = sizeof(*data);
		pa_next = data->next;

		memblock_reserve(pa_data, sizeof(*data) + data->len);

		if (data->type == SETUP_INDIRECT) {
			len += data->len;
			early_memunmap(data, sizeof(*data));
			data = early_memremap(pa_data, len);
			if (!data) {
				pr_warn("setup: failed to memremap indirect setup_data\n");
				return;
			}

			indirect = (struct setup_indirect *)data->data;

			if (indirect->type != SETUP_INDIRECT)
				memblock_reserve(indirect->addr, indirect->len);
		}

		pa_data = pa_next;
		early_memunmap(data, len);
	}
}

static void __init arch_reserve_crashkernel(void)
{
	unsigned long long crash_base, crash_size, low_size = 0;
	char *cmdline = boot_command_line;
	bool high = false;
	int ret;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
				&crash_size, &crash_base,
				&low_size, &high);
	if (ret)
		return;

	if (xen_pv_domain()) {
		pr_info("Ignoring crashkernel for a Xen PV domain\n");
		return;
	}

	reserve_crashkernel_generic(cmdline, crash_size, crash_base,
				    low_size, high);
}

static struct resource standard_io_resources[] = {
	{ .name = "dma1", .start = 0x00, .end = 0x1f,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "pic1", .start = 0x20, .end = 0x21,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "timer0", .start = 0x40, .end = 0x43,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "timer1", .start = 0x50, .end = 0x53,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "keyboard", .start = 0x60, .end = 0x60,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "keyboard", .start = 0x64, .end = 0x64,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "fpu", .start = 0xf0, .end = 0xff,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};

void __init reserve_standard_io_resources(void)
{
	int i;

	/* request I/O space for devices used on all i[345]86 PCs */
	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
		request_resource(&ioport_resource, &standard_io_resources[i]);

}

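/*
 * Detect a Sandy Bridge integrated graphics device (PCI 00:02.0) by matching
 * its device ID against the known list; trim_snb_memory() uses this to decide
 * whether its problematic pages need to be reserved.
 */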
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
	int i;
	u16 vendor, devid;
	static const __initconst u16 snb_ids[] = {
		0x0102,
		0x0112,
		0x0122,
		0x0106,
		0x0116,
		0x0126,
		0x010a,
	};

	/* Assume no if something weird is going on with PCI */
	if (!early_pci_allowed())
		return false;

	vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
	if (vendor != 0x8086)
		return false;

	devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
	for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
		if (devid == snb_ids[i])
			return true;
#endif

	return false;
}

/*
 * Sandy Bridge graphics has trouble with certain ranges, exclude
 * them from allocation.
 */
static void __init trim_snb_memory(void)
{
	static const __initconst unsigned long bad_pages[] = {
		0x20050000,
		0x20110000,
		0x20130000,
		0x20138000,
		0x40004000,
	};
	int i;

	if (!snb_gfx_workaround_needed())
		return;

	printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");

	/*
	 * SandyBridge integrated graphics devices have a bug that prevents
	 * them from accessing certain memory ranges, namely anything below
	 * 1M and in the pages listed in bad_pages[] above.
	 *
	 * To avoid these pages being ever accessed by SNB gfx devices reserve
	 * bad_pages that have not already been reserved at boot time.
	 * All memory below the 1 MB mark is anyway reserved later during
	 * setup_arch(), so there is no need to reserve it here.
	 */

	for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
		if (memblock_reserve(bad_pages[i], PAGE_SIZE))
			printk(KERN_WARNING "failed to reserve 0x%08lx\n",
			       bad_pages[i]);
	}
}

static void __init trim_bios_range(void)
{
	/*
	 * A special case is the first 4Kb of memory;
	 * This is a BIOS owned area, not kernel ram, but generally
	 * not listed as such in the E820 table.
	 *
	 * This typically reserves additional memory (64KiB by default)
	 * since some BIOSes are known to corrupt low memory. See the
	 * Kconfig help text for X86_RESERVE_LOW.
	 */
	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);

	/*
	 * special case: Some BIOSes report the PC BIOS
	 * area (640Kb -> 1Mb) as RAM even though it is not.
	 * take them out.
	 */
	e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);

	e820__update_table(e820_table);
}

/* called before trim_bios_range() to spare extra sanitize */
static void __init e820_add_kernel_range(void)
{
	u64 start = __pa_symbol(_text);
	u64 size = __pa_symbol(_end) - start;

	/*
	 * Complain if .text, .data and .bss are not marked as E820_TYPE_RAM
	 * and attempt to fix it by adding the range. We may have a confused
	 * BIOS, or the user may have used memmap=exactmap or memmap=xxM$yyM to
	 * exclude the kernel range. If we really are running on top of
	 * non-RAM, we will crash later anyways.
	 */
	if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
		return;

	pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
	e820__range_remove(start, size, E820_TYPE_RAM, 0);
	e820__range_add(start, size, E820_TYPE_RAM);
}

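/*
 * Early reservations that must be done before the e820 map is fed into
 * memblock: the kernel image, the first 64K of low memory, the initrd, the
 * setup_data list, the BIOS regions and the Sandy Bridge quirk pages.
 */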
static void __init early_reserve_memory(void)
{
	/*
	 * Reserve the memory occupied by the kernel between _text and
	 * __end_of_kernel_reserve symbols. Any kernel sections after the
	 * __end_of_kernel_reserve symbol must be explicitly reserved with a
	 * separate memblock_reserve() or they will be discarded.
	 */
	memblock_reserve(__pa_symbol(_text),
			 (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);

	/*
	 * The first 4Kb of memory is a BIOS owned area, but generally it is
	 * not listed as such in the E820 table.
	 *
	 * Reserve the first 64K of memory since some BIOSes are known to
	 * corrupt low memory. After the real mode trampoline is allocated the
	 * rest of the memory below 640k is reserved.
	 *
	 * In addition, make sure page 0 is always reserved because on
	 * systems with L1TF its contents can be leaked to user processes.
	 */
	memblock_reserve(0, SZ_64K);

	early_reserve_initrd();

	memblock_x86_reserve_range_setup_data();

	reserve_bios_regions();
	trim_snb_memory();
}

/*
 * Dump out kernel offset information on panic.
 */
static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
	if (kaslr_enabled()) {
		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
			 kaslr_offset(),
			 __START_KERNEL,
			 __START_KERNEL_map,
			 MODULES_VADDR-1);
	} else {
		pr_emerg("Kernel Offset: disabled\n");
	}

	return 0;
}

void x86_configure_nx(void)
{
	if (boot_cpu_has(X86_FEATURE_NX))
		__supported_pte_mask |= _PAGE_NX;
	else
		__supported_pte_mask &= ~_PAGE_NX;
}

static void __init x86_report_nx(void)
{
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
		       "missing in CPU!\n");
	} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#else
		/* 32bit non-PAE kernel, NX cannot be used */
		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
		       "cannot be enabled: non-PAE kernel!\n");
#endif
	}
}

/*
 * Determine if we were loaded by an EFI loader. If so, then we have also been
 * passed the efi memmap, systab, etc., so we should use these data structures
 * for initialization. Note, the efi init code path is determined by the
 * global efi_enabled. This allows the same kernel image to be used on existing
 * systems (with a traditional BIOS) as well as on EFI systems.
 */
/*
 * setup_arch - architecture-specific boot-time initializations
 *
 * Note: On x86_64, fixmaps are ready for use even before this is called.
 */

void __init setup_arch(char **cmdline_p)
{
#ifdef CONFIG_X86_32
	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));

	/*
	 * copy kernel address range established so far and switch
	 * to the proper swapper page table
	 */
	clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
			initial_page_table + KERNEL_PGD_BOUNDARY,
			KERNEL_PGD_PTRS);

	load_cr3(swapper_pg_dir);
	/*
	 * Note: Quark X1000 CPUs advertise PGE incorrectly and require
	 * a cr3 based tlb flush, so the following __flush_tlb_all()
	 * will not flush anything because the CPU quirk which clears
	 * X86_FEATURE_PGE has not been invoked yet. Though due to the
	 * load_cr3() above the TLB has been flushed already. The
	 * quirk is invoked before subsequent calls to __flush_tlb_all()
	 * so proper operation is guaranteed.
	 */
	__flush_tlb_all();
#else
	printk(KERN_INFO "Command line: %s\n", boot_command_line);
	boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif

	/*
	 * If we have OLPC OFW, we might end up relocating the fixmap due to
	 * reserve_top(), so do this before touching the ioremap area.
	 */
	olpc_ofw_detect();

	idt_setup_early_traps();
	early_cpu_init();
	jump_label_init();
	static_call_init();
	early_ioremap_init();

	setup_olpc_ofw_pgd();

	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
	screen_info = boot_params.screen_info;
	edid_info = boot_params.edid_info;
#ifdef CONFIG_X86_32
	apm_info.bios = boot_params.apm_bios_info;
	ist_info = boot_params.ist_info;
#endif
	saved_video_mode = boot_params.hdr.vid_mode;
	bootloader_type = boot_params.hdr.type_of_loader;
	if ((bootloader_type >> 4) == 0xe) {
		bootloader_type &= 0xf;
		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
	}
	bootloader_version = bootloader_type & 0xf;
	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;

#ifdef CONFIG_BLK_DEV_RAM
	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
#endif
#ifdef CONFIG_EFI
	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
		     EFI32_LOADER_SIGNATURE, 4)) {
		set_bit(EFI_BOOT, &efi.flags);
	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
		     EFI64_LOADER_SIGNATURE, 4)) {
		set_bit(EFI_BOOT, &efi.flags);
		set_bit(EFI_64BIT, &efi.flags);
	}
#endif

	x86_init.oem.arch_setup();

	/*
	 * Do some memory reservations *before* memory is added to memblock, so
	 * memblock allocations won't overwrite it.
	 *
	 * After this point, everything still needed from the boot loader or
	 * firmware or kernel text should be early reserved or marked not RAM in
	 * e820. All other memory is free game.
	 *
	 * This call needs to happen before e820__memory_setup() which calls the
	 * xen_memory_setup() on Xen dom0 which relies on the fact that those
	 * early reservations have happened already.
	 */
	early_reserve_memory();

	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
	e820__memory_setup();
	parse_setup_data();

	copy_edd();

	if (!boot_params.hdr.root_flags)
		root_mountflags &= ~MS_RDONLY;
	setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);

	code_resource.start = __pa_symbol(_text);
	code_resource.end = __pa_symbol(_etext)-1;
	rodata_resource.start = __pa_symbol(__start_rodata);
	rodata_resource.end = __pa_symbol(__end_rodata)-1;
	data_resource.start = __pa_symbol(_sdata);
	data_resource.end = __pa_symbol(_edata)-1;
	bss_resource.start = __pa_symbol(__bss_start);
	bss_resource.end = __pa_symbol(__bss_stop)-1;

#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
	strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
	if (builtin_cmdline[0]) {
		/* append boot loader cmdline to builtin */
		strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
		strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
		strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
	}
#endif
#endif

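	/* Hand the final command line to the generic init code via *cmdline_p. */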
	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
	*cmdline_p = command_line;

	/*
	 * x86_configure_nx() is called before parse_early_param() to detect
	 * whether hardware doesn't support NX (so that the early EHCI debug
	 * console setup can safely call set_fixmap()).
	 */
	x86_configure_nx();

	parse_early_param();

	if (efi_enabled(EFI_BOOT))
		efi_memblock_x86_reserve_range();

#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Memory used by the kernel cannot be hot-removed because Linux
	 * cannot migrate the kernel pages. When memory hotplug is
	 * enabled, we should prevent memblock from allocating memory
	 * for the kernel.
	 *
	 * ACPI SRAT records all hotpluggable memory ranges. But before
	 * SRAT is parsed, we don't know about it.
	 *
	 * The kernel image is loaded into memory at very early time. We
	 * cannot prevent this anyway. So on NUMA system, we set any
	 * node the kernel resides in as un-hotpluggable.
	 *
	 * Since on modern servers, one node could have double-digit
	 * gigabytes memory, we can assume the memory around the kernel
	 * image is also un-hotpluggable. So before SRAT is parsed, just
	 * allocate memory near the kernel image to try the best to keep
	 * the kernel away from hotpluggable memory.
	 */
	if (movable_node_is_enabled())
		memblock_set_bottom_up(true);
#endif

	x86_report_nx();

	apic_setup_apic_calls();

	if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
		apic_is_disabled = true;
#endif
		setup_clear_cpu_cap(X86_FEATURE_APIC);
	}

	e820__reserve_setup_data();
	e820__finish_early_params();

	if (efi_enabled(EFI_BOOT))
		efi_init();

	reserve_ibft_region();
	x86_init.resources.dmi_setup();

	/*
	 * VMware detection requires dmi to be available, so this
	 * needs to be done after dmi_setup(), for the boot CPU.
	 * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be
	 * called before cache_bp_init() for setting up MTRR state.
	 */
	init_hypervisor_platform();

	tsc_early_init();
	x86_init.resources.probe_roms();

	/* after parse_early_param(), so we can debug it */
	insert_resource(&iomem_resource, &code_resource);
	insert_resource(&iomem_resource, &rodata_resource);
	insert_resource(&iomem_resource, &data_resource);
	insert_resource(&iomem_resource, &bss_resource);

	e820_add_kernel_range();
	trim_bios_range();
#ifdef CONFIG_X86_32
	if (ppro_with_ram_bug()) {
		e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
				   E820_TYPE_RESERVED);
		e820__update_table(e820_table);
		printk(KERN_INFO "fixed physical RAM map:\n");
		e820__print_table("bad_ppro");
	}
#else
	early_gart_iommu_check();
#endif

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	max_pfn = e820__end_of_ram_pfn();

	/* update e820 for memory not covered by WB MTRRs */
	cache_bp_init();
	if (mtrr_trim_uncached_memory(max_pfn))
		max_pfn = e820__end_of_ram_pfn();

	max_possible_pfn = max_pfn;

	/*
	 * Define random base addresses for memory sections after max_pfn is
	 * defined and before each memory section base is used.
	 */
	kernel_randomize_memory();

#ifdef CONFIG_X86_32
	/* max_low_pfn gets updated here */
	find_low_pfn_range();
#else
	check_x2apic();

	/* How many end-of-memory variables you have, grandma! */
	/* need this before calling reserve_initrd */
	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
		max_low_pfn = e820__end_of_low_ram_pfn();
	else
		max_low_pfn = max_pfn;

	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif

	/* Find and reserve MPTABLE area */
	x86_init.mpparse.find_mptable();

	early_alloc_pgt_buf();

	/*
	 * Need to conclude brk, before e820__memblock_setup()
	 * it could use memblock_find_in_range, could overlap with
	 * brk area.
	 */
	reserve_brk();

	cleanup_highmap();

	memblock_set_current_limit(ISA_END_ADDRESS);
	e820__memblock_setup();

	/*
	 * Needs to run after memblock setup because it needs the physical
	 * memory size.
	 */
	mem_encrypt_setup_arch();

	efi_fake_memmap();
	efi_find_mirror();
	efi_esrt_init();
	efi_mokvar_table_init();

	/*
	 * The EFI specification says that boot service code won't be
	 * called after ExitBootServices(). This is, in fact, a lie.
	 */
	efi_reserve_boot_services();

	/* preallocate 4k for mptable mpc */
	e820__memblock_alloc_reserved_mpc_new();

#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
	setup_bios_corruption_check();
#endif

#ifdef CONFIG_X86_32
	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
	       (max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif

	/*
	 * Find free memory for the real mode trampoline and place it there. If
	 * there is not enough free memory under 1M, on EFI-enabled systems
	 * there will be an additional attempt to reclaim the memory for the
	 * real mode trampoline at efi_free_boot_services().
	 *
	 * Unconditionally reserve the entire first 1M of RAM because BIOSes
	 * are known to corrupt low memory and several hundred kilobytes are
	 * not worth the complexity of detecting what memory gets clobbered.
	 * Windows does the same thing for very similar reasons.
	 *
	 * Moreover, on machines with SandyBridge graphics or in setups that use
	 * crashkernel the entire 1M is reserved anyway.
	 *
	 * Note that the TDX host kernel also requires the first 1M to be
	 * reserved.
	 */
	x86_platform.realmode_reserve();

	init_mem_mapping();

	idt_setup_early_pf();

	/*
	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
	 * with the current CR4 value. This may not be necessary, but
	 * auditing all the early-boot CR4 manipulation would be needed to
	 * rule it out.
	 *
	 * Mask off features that don't work outside long mode (just
	 * PCIDE for now).
	 */
	mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;

	memblock_set_current_limit(get_max_mapped());

	/*
	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
	 */

#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
	if (init_ohci1394_dma_early)
		init_ohci1394_dma_on_all_controllers();
#endif
	/* Allocate bigger log buffer */
	setup_log_buf(1);

	if (efi_enabled(EFI_BOOT)) {
		switch (boot_params.secure_boot) {
		case efi_secureboot_mode_disabled:
			pr_info("Secure boot disabled\n");
			break;
		case efi_secureboot_mode_enabled:
			pr_info("Secure boot enabled\n");
			break;
		default:
			pr_info("Secure boot could not be determined\n");
			break;
		}
	}

	reserve_initrd();

	acpi_table_upgrade();
	/* Look for ACPI tables and reserve memory occupied by them. */
	acpi_boot_table_init();

	vsmp_init();

	io_delay_init();

	early_platform_quirks();

	/* Some platforms need the APIC registered for NUMA configuration */
	early_acpi_boot_init();
	x86_init.mpparse.early_parse_smp_cfg();

	x86_flattree_get_config();

	initmem_init();
	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);

	if (boot_cpu_has(X86_FEATURE_GBPAGES))
		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);

	/*
	 * Reserve memory for crash kernel after SRAT is parsed so that it
	 * won't consume hotpluggable memory.
	 */
	arch_reserve_crashkernel();

	memblock_find_dma_reserve();

	if (!early_xdbc_setup_hardware())
		early_xdbc_register_console();

	x86_init.paging.pagetable_init();

	kasan_init();

	/*
	 * Sync back kernel address range.
	 *
	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
	 * this call?
	 */
	sync_initial_page_table();

	tboot_probe();

	map_vsyscall();

	x86_32_probe_apic();

	early_quirks();

	topology_apply_cmdline_limits_early();

	/*
	 * Parse SMP configuration. Try ACPI first and then the platform
	 * specific parser.
	 */
	acpi_boot_init();
	x86_init.mpparse.parse_smp_cfg();

	/* Last opportunity to detect and map the local APIC */
	init_apic_mappings();

	topology_init_possible_cpus();

	init_cpu_to_node();
	init_gi_nodes();

	io_apic_init_mappings();

	x86_init.hyper.guest_late_init();

	e820__reserve_resources();
	e820__register_nosave_regions(max_pfn);

	x86_init.resources.reserve_resources();

	e820__setup_pci_gap();

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
		vgacon_register_screen(&screen_info);
#endif
#endif
	x86_init.oem.banner();

	x86_init.timers.wallclock_init();

	/*
	 * This needs to run before setup_local_APIC() which soft-disables the
	 * local APIC temporarily and that masks the thermal LVT interrupt,
	 * leading to softlockups on machines which have configured SMI
	 * interrupt delivery.
	 */
	therm_lvt_init();

	mcheck_init();

	register_refined_jiffies(CLOCK_TICK_RATE);

#ifdef CONFIG_EFI
	if (efi_enabled(EFI_BOOT))
		efi_apply_memmap_quirks();
#endif

	unwind_init();
}

#ifdef CONFIG_X86_32

static struct resource video_ram_resource = {
	.name = "Video RAM area",
	.start = 0xa0000,
	.end = 0xbffff,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

void __init i386_reserve_resources(void)
{
	request_resource(&iomem_resource, &video_ram_resource);
	reserve_standard_io_resources();
}

#endif /* CONFIG_X86_32 */

static struct notifier_block kernel_offset_notifier = {
	.notifier_call = dump_kernel_offset
};

static int __init register_kernel_offset_dumper(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &kernel_offset_notifier);
	return 0;
}
__initcall(register_kernel_offset_dumper);