1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * Copyright 2013 Joyent, Inc. All rights reserved. 27 */ 28 29 30 #include <sys/types.h> 31 #include <sys/machparam.h> 32 #include <sys/x86_archext.h> 33 #include <sys/systm.h> 34 #include <sys/mach_mmu.h> 35 #include <sys/multiboot.h> 36 #include <sys/multiboot2.h> 37 #include <sys/multiboot2_impl.h> 38 #include <sys/sysmacros.h> 39 #include <sys/sha1.h> 40 #include <util/string.h> 41 #include <util/strtolctype.h> 42 #include <sys/efi.h> 43 44 /* 45 * Compile time debug knob. We do not have any early mechanism to control it 46 * as the boot is the earliest mechanism we have, and we do not want to have 47 * it being switched on by default. 48 */ 49 int dboot_debug = 0; 50 51 #if defined(__xpv) 52 53 #include <sys/hypervisor.h> 54 uintptr_t xen_virt_start; 55 pfn_t *mfn_to_pfn_mapping; 56 57 #else /* !__xpv */ 58 59 extern multiboot_header_t mb_header; 60 extern uint32_t mb2_load_addr; 61 extern int have_cpuid(void); 62 63 #endif /* !__xpv */ 64 65 #include <sys/inttypes.h> 66 #include <sys/bootinfo.h> 67 #include <sys/mach_mmu.h> 68 #include <sys/boot_console.h> 69 70 #include "dboot_asm.h" 71 #include "dboot_printf.h" 72 #include "dboot_xboot.h" 73 #include "dboot_elfload.h" 74 75 #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2) 76 77 /* 78 * This file contains code that runs to transition us from either a multiboot 79 * compliant loader (32 bit non-paging) or a XPV domain loader to 80 * regular kernel execution. Its task is to setup the kernel memory image 81 * and page tables. 82 * 83 * The code executes as: 84 * - 32 bits under GRUB (for 32 or 64 bit Solaris) 85 * - a 32 bit program for the 32-bit PV hypervisor 86 * - a 64 bit program for the 64-bit PV hypervisor (at least for now) 87 * 88 * Under the PV hypervisor, we must create mappings for any memory beyond the 89 * initial start of day allocation (such as the kernel itself). 90 * 91 * When on the metal, the mapping between maddr_t and paddr_t is 1:1. 92 * Since we are running in real mode, so all such memory is accessible. 93 */ 94 95 /* 96 * Standard bits used in PTE (page level) and PTP (internal levels) 97 */ 98 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER; 99 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST; 100 101 /* 102 * This is the target addresses (physical) where the kernel text and data 103 * nucleus pages will be unpacked. On the hypervisor this is actually a 104 * virtual address. 105 */ 106 paddr_t ktext_phys; 107 uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */ 108 109 static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ 110 111 /* 112 * The stack is setup in assembler before entering startup_kernel() 113 */ 114 char stack_space[STACK_SIZE]; 115 116 /* 117 * Used to track physical memory allocation 118 */ 119 static paddr_t next_avail_addr = 0; 120 121 #if defined(__xpv) 122 /* 123 * Additional information needed for hypervisor memory allocation. 124 * Only memory up to scratch_end is mapped by page tables. 125 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so 126 * to derive a pfn from a pointer, you subtract mfn_base. 127 */ 128 129 static paddr_t scratch_end = 0; /* we can't write all of mem here */ 130 static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */ 131 start_info_t *xen_info; 132 133 #else /* __xpv */ 134 135 /* 136 * If on the metal, then we have a multiboot loader. 137 */ 138 uint32_t mb_magic; /* magic from boot loader */ 139 uint32_t mb_addr; /* multiboot info package from loader */ 140 int multiboot_version; 141 multiboot_info_t *mb_info; 142 multiboot2_info_header_t *mb2_info; 143 multiboot_tag_mmap_t *mb2_mmap_tagp; 144 int num_entries; /* mmap entry count */ 145 boolean_t num_entries_set; /* is mmap entry count set */ 146 uintptr_t load_addr; 147 148 /* can not be automatic variables because of alignment */ 149 static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID; 150 static efi_guid_t smbios = SMBIOS_TABLE_GUID; 151 static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID; 152 static efi_guid_t acpi1 = ACPI_10_TABLE_GUID; 153 #endif /* __xpv */ 154 155 /* 156 * This contains information passed to the kernel 157 */ 158 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */ 159 struct xboot_info *bi; 160 161 /* 162 * Page table and memory stuff. 163 */ 164 static paddr_t max_mem; /* maximum memory address */ 165 166 /* 167 * Information about processor MMU 168 */ 169 int amd64_support = 0; 170 int largepage_support = 0; 171 int pae_support = 0; 172 int pge_support = 0; 173 int NX_support = 0; 174 175 /* 176 * Low 32 bits of kernel entry address passed back to assembler. 177 * When running a 64 bit kernel, the high 32 bits are 0xffffffff. 178 */ 179 uint32_t entry_addr_low; 180 181 /* 182 * Memlists for the kernel. We shouldn't need a lot of these. 183 */ 184 #define MAX_MEMLIST (50) 185 struct boot_memlist memlists[MAX_MEMLIST]; 186 uint_t memlists_used = 0; 187 struct boot_memlist pcimemlists[MAX_MEMLIST]; 188 uint_t pcimemlists_used = 0; 189 struct boot_memlist rsvdmemlists[MAX_MEMLIST]; 190 uint_t rsvdmemlists_used = 0; 191 192 /* 193 * This should match what's in the bootloader. It's arbitrary, but GRUB 194 * in particular has limitations on how much space it can use before it 195 * stops working properly. This should be enough. 196 */ 197 struct boot_modules modules[MAX_BOOT_MODULES]; 198 uint_t modules_used = 0; 199 200 #ifdef __xpv 201 /* 202 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry 203 * definition in Xen source. 204 */ 205 typedef struct { 206 uint32_t base_addr_low; 207 uint32_t base_addr_high; 208 uint32_t length_low; 209 uint32_t length_high; 210 uint32_t type; 211 } mmap_t; 212 213 /* 214 * There is 512KB of scratch area after the boot stack page. 215 * We'll use that for everything except the kernel nucleus pages which are too 216 * big to fit there and are allocated last anyway. 217 */ 218 #define MAXMAPS 100 219 static mmap_t map_buffer[MAXMAPS]; 220 #else 221 typedef mb_memory_map_t mmap_t; 222 #endif 223 224 /* 225 * Debugging macros 226 */ 227 uint_t prom_debug = 0; 228 uint_t map_debug = 0; 229 230 static char noname[2] = "-"; 231 232 /* 233 * Either hypervisor-specific or grub-specific code builds the initial 234 * memlists. This code does the sort/merge/link for final use. 235 */ 236 static void 237 sort_physinstall(void) 238 { 239 int i; 240 #if !defined(__xpv) 241 int j; 242 struct boot_memlist tmp; 243 244 /* 245 * Now sort the memlists, in case they weren't in order. 246 * Yeah, this is a bubble sort; small, simple and easy to get right. 247 */ 248 DBG_MSG("Sorting phys-installed list\n"); 249 for (j = memlists_used - 1; j > 0; --j) { 250 for (i = 0; i < j; ++i) { 251 if (memlists[i].addr < memlists[i + 1].addr) 252 continue; 253 tmp = memlists[i]; 254 memlists[i] = memlists[i + 1]; 255 memlists[i + 1] = tmp; 256 } 257 } 258 259 /* 260 * Merge any memlists that don't have holes between them. 261 */ 262 for (i = 0; i <= memlists_used - 1; ++i) { 263 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr) 264 continue; 265 266 if (prom_debug) 267 dboot_printf( 268 "merging mem segs %" PRIx64 "...%" PRIx64 269 " w/ %" PRIx64 "...%" PRIx64 "\n", 270 memlists[i].addr, 271 memlists[i].addr + memlists[i].size, 272 memlists[i + 1].addr, 273 memlists[i + 1].addr + memlists[i + 1].size); 274 275 memlists[i].size += memlists[i + 1].size; 276 for (j = i + 1; j < memlists_used - 1; ++j) 277 memlists[j] = memlists[j + 1]; 278 --memlists_used; 279 DBG(memlists_used); 280 --i; /* after merging we need to reexamine, so do this */ 281 } 282 #endif /* __xpv */ 283 284 if (prom_debug) { 285 dboot_printf("\nFinal memlists:\n"); 286 for (i = 0; i < memlists_used; ++i) { 287 dboot_printf("\t%d: addr=%" PRIx64 " size=%" 288 PRIx64 "\n", i, memlists[i].addr, memlists[i].size); 289 } 290 } 291 292 /* 293 * link together the memlists with native size pointers 294 */ 295 memlists[0].next = 0; 296 memlists[0].prev = 0; 297 for (i = 1; i < memlists_used; ++i) { 298 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1); 299 memlists[i].next = 0; 300 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i); 301 } 302 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists; 303 DBG(bi->bi_phys_install); 304 } 305 306 /* 307 * build bios reserved memlists 308 */ 309 static void 310 build_rsvdmemlists(void) 311 { 312 int i; 313 314 rsvdmemlists[0].next = 0; 315 rsvdmemlists[0].prev = 0; 316 for (i = 1; i < rsvdmemlists_used; ++i) { 317 rsvdmemlists[i].prev = 318 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1); 319 rsvdmemlists[i].next = 0; 320 rsvdmemlists[i - 1].next = 321 (native_ptr_t)(uintptr_t)(rsvdmemlists + i); 322 } 323 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists; 324 DBG(bi->bi_rsvdmem); 325 } 326 327 #if defined(__xpv) 328 329 /* 330 * halt on the hypervisor after a delay to drain console output 331 */ 332 void 333 dboot_halt(void) 334 { 335 uint_t i = 10000; 336 337 while (--i) 338 (void) HYPERVISOR_yield(); 339 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff); 340 } 341 342 /* 343 * From a machine address, find the corresponding pseudo-physical address. 344 * Pseudo-physical address are contiguous and run from mfn_base in each VM. 345 * Machine addresses are the real underlying hardware addresses. 346 * These are needed for page table entries. Note that this routine is 347 * poorly protected. A bad value of "ma" will cause a page fault. 348 */ 349 paddr_t 350 ma_to_pa(maddr_t ma) 351 { 352 ulong_t pgoff = ma & MMU_PAGEOFFSET; 353 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)]; 354 paddr_t pa; 355 356 if (pfn >= xen_info->nr_pages) 357 return (-(paddr_t)1); 358 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff; 359 #ifdef DEBUG 360 if (ma != pa_to_ma(pa)) 361 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", " 362 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa)); 363 #endif 364 return (pa); 365 } 366 367 /* 368 * From a pseudo-physical address, find the corresponding machine address. 369 */ 370 maddr_t 371 pa_to_ma(paddr_t pa) 372 { 373 pfn_t pfn; 374 ulong_t mfn; 375 376 pfn = mmu_btop(pa - mfn_base); 377 if (pa < mfn_base || pfn >= xen_info->nr_pages) 378 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa); 379 mfn = ((ulong_t *)xen_info->mfn_list)[pfn]; 380 #ifdef DEBUG 381 if (mfn_to_pfn_mapping[mfn] != pfn) 382 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n", 383 pfn, mfn, mfn_to_pfn_mapping[mfn]); 384 #endif 385 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET)); 386 } 387 388 #endif /* __xpv */ 389 390 x86pte_t 391 get_pteval(paddr_t table, uint_t index) 392 { 393 if (pae_support) 394 return (((x86pte_t *)(uintptr_t)table)[index]); 395 return (((x86pte32_t *)(uintptr_t)table)[index]); 396 } 397 398 /*ARGSUSED*/ 399 void 400 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval) 401 { 402 #ifdef __xpv 403 mmu_update_t t; 404 maddr_t mtable = pa_to_ma(table); 405 int retcnt; 406 407 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE; 408 t.val = pteval; 409 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1) 410 dboot_panic("HYPERVISOR_mmu_update() failed"); 411 #else /* __xpv */ 412 uintptr_t tab_addr = (uintptr_t)table; 413 414 if (pae_support) 415 ((x86pte_t *)tab_addr)[index] = pteval; 416 else 417 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval; 418 if (level == top_level && level == 2) 419 reload_cr3(); 420 #endif /* __xpv */ 421 } 422 423 paddr_t 424 make_ptable(x86pte_t *pteval, uint_t level) 425 { 426 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 427 428 if (level == top_level && level == 2) 429 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID; 430 else 431 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits; 432 433 #ifdef __xpv 434 /* Remove write permission to the new page table. */ 435 if (HYPERVISOR_update_va_mapping(new_table, 436 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL)) 437 dboot_panic("HYP_update_va_mapping error"); 438 #endif 439 440 if (map_debug) 441 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%" 442 PRIx64 "\n", level, (ulong_t)new_table, *pteval); 443 return (new_table); 444 } 445 446 x86pte_t * 447 map_pte(paddr_t table, uint_t index) 448 { 449 return ((x86pte_t *)(uintptr_t)(table + index * pte_size)); 450 } 451 452 /* 453 * dump out the contents of page tables... 454 */ 455 static void 456 dump_tables(void) 457 { 458 uint_t save_index[4]; /* for recursion */ 459 char *save_table[4]; /* for recursion */ 460 uint_t l; 461 uint64_t va; 462 uint64_t pgsize; 463 int index; 464 int i; 465 x86pte_t pteval; 466 char *table; 467 static char *tablist = "\t\t\t"; 468 char *tabs = tablist + 3 - top_level; 469 uint_t pa, pa1; 470 #if !defined(__xpv) 471 #define maddr_t paddr_t 472 #endif /* !__xpv */ 473 474 dboot_printf("Finished pagetables:\n"); 475 table = (char *)(uintptr_t)top_page_table; 476 l = top_level; 477 va = 0; 478 for (index = 0; index < ptes_per_table; ++index) { 479 pgsize = 1ull << shift_amt[l]; 480 if (pae_support) 481 pteval = ((x86pte_t *)table)[index]; 482 else 483 pteval = ((x86pte32_t *)table)[index]; 484 if (pteval == 0) 485 goto next_entry; 486 487 dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64, 488 tabs + l, (void *)table, index, (uint64_t)pteval, va); 489 pa = ma_to_pa(pteval & MMU_PAGEMASK); 490 dboot_printf(" physaddr=%x\n", pa); 491 492 /* 493 * Don't try to walk hypervisor private pagetables 494 */ 495 if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) { 496 save_table[l] = table; 497 save_index[l] = index; 498 --l; 499 index = -1; 500 table = (char *)(uintptr_t) 501 ma_to_pa(pteval & MMU_PAGEMASK); 502 goto recursion; 503 } 504 505 /* 506 * shorten dump for consecutive mappings 507 */ 508 for (i = 1; index + i < ptes_per_table; ++i) { 509 if (pae_support) 510 pteval = ((x86pte_t *)table)[index + i]; 511 else 512 pteval = ((x86pte32_t *)table)[index + i]; 513 if (pteval == 0) 514 break; 515 pa1 = ma_to_pa(pteval & MMU_PAGEMASK); 516 if (pa1 != pa + i * pgsize) 517 break; 518 } 519 if (i > 2) { 520 dboot_printf("%s...\n", tabs + l); 521 va += pgsize * (i - 2); 522 index += i - 2; 523 } 524 next_entry: 525 va += pgsize; 526 if (l == 3 && index == 256) /* VA hole */ 527 va = 0xffff800000000000ull; 528 recursion: 529 ; 530 } 531 if (l < top_level) { 532 ++l; 533 index = save_index[l]; 534 table = save_table[l]; 535 goto recursion; 536 } 537 } 538 539 /* 540 * Add a mapping for the machine page at the given virtual address. 541 */ 542 static void 543 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level) 544 { 545 x86pte_t *ptep; 546 x86pte_t pteval; 547 548 pteval = ma | pte_bits; 549 if (level > 0) 550 pteval |= PT_PAGESIZE; 551 if (va >= target_kernel_text && pge_support) 552 pteval |= PT_GLOBAL; 553 554 if (map_debug && ma != va) 555 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64 556 " pte=0x%" PRIx64 " l=%d\n", 557 (uint64_t)ma, (uint64_t)va, pteval, level); 558 559 #if defined(__xpv) 560 /* 561 * see if we can avoid find_pte() on the hypervisor 562 */ 563 if (HYPERVISOR_update_va_mapping(va, pteval, 564 UVMF_INVLPG | UVMF_LOCAL) == 0) 565 return; 566 #endif 567 568 /* 569 * Find the pte that will map this address. This creates any 570 * missing intermediate level page tables 571 */ 572 ptep = find_pte(va, NULL, level, 0); 573 574 /* 575 * When paravirtualized, we must use hypervisor calls to modify the 576 * PTE, since paging is active. On real hardware we just write to 577 * the pagetables which aren't in use yet. 578 */ 579 #if defined(__xpv) 580 ptep = ptep; /* shut lint up */ 581 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL)) 582 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64 583 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "", 584 (uint64_t)va, level, (uint64_t)ma, pteval); 585 #else 586 if (va < 1024 * 1024) 587 pteval |= PT_NOCACHE; /* for video RAM */ 588 if (pae_support) 589 *ptep = pteval; 590 else 591 *((x86pte32_t *)ptep) = (x86pte32_t)pteval; 592 #endif 593 } 594 595 /* 596 * Add a mapping for the physical page at the given virtual address. 597 */ 598 static void 599 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level) 600 { 601 map_ma_at_va(pa_to_ma(pa), va, level); 602 } 603 604 /* 605 * This is called to remove start..end from the 606 * possible range of PCI addresses. 607 */ 608 const uint64_t pci_lo_limit = 0x00100000ul; 609 const uint64_t pci_hi_limit = 0xfff00000ul; 610 static void 611 exclude_from_pci(uint64_t start, uint64_t end) 612 { 613 int i; 614 int j; 615 struct boot_memlist *ml; 616 617 for (i = 0; i < pcimemlists_used; ++i) { 618 ml = &pcimemlists[i]; 619 620 /* delete the entire range? */ 621 if (start <= ml->addr && ml->addr + ml->size <= end) { 622 --pcimemlists_used; 623 for (j = i; j < pcimemlists_used; ++j) 624 pcimemlists[j] = pcimemlists[j + 1]; 625 --i; /* to revisit the new one at this index */ 626 } 627 628 /* split a range? */ 629 else if (ml->addr < start && end < ml->addr + ml->size) { 630 631 ++pcimemlists_used; 632 if (pcimemlists_used > MAX_MEMLIST) 633 dboot_panic("too many pcimemlists"); 634 635 for (j = pcimemlists_used - 1; j > i; --j) 636 pcimemlists[j] = pcimemlists[j - 1]; 637 ml->size = start - ml->addr; 638 639 ++ml; 640 ml->size = (ml->addr + ml->size) - end; 641 ml->addr = end; 642 ++i; /* skip on to next one */ 643 } 644 645 /* cut memory off the start? */ 646 else if (ml->addr < end && end < ml->addr + ml->size) { 647 ml->size -= end - ml->addr; 648 ml->addr = end; 649 } 650 651 /* cut memory off the end? */ 652 else if (ml->addr <= start && start < ml->addr + ml->size) { 653 ml->size = start - ml->addr; 654 } 655 } 656 } 657 658 /* 659 * During memory allocation, find the highest address not used yet. 660 */ 661 static void 662 check_higher(paddr_t a) 663 { 664 if (a < next_avail_addr) 665 return; 666 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); 667 DBG(next_avail_addr); 668 } 669 670 static int 671 dboot_loader_mmap_entries(void) 672 { 673 #if !defined(__xpv) 674 if (num_entries_set == B_TRUE) 675 return (num_entries); 676 677 switch (multiboot_version) { 678 case 1: 679 DBG(mb_info->flags); 680 if (mb_info->flags & 0x40) { 681 mb_memory_map_t *mmap; 682 683 DBG(mb_info->mmap_addr); 684 DBG(mb_info->mmap_length); 685 check_higher(mb_info->mmap_addr + mb_info->mmap_length); 686 687 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; 688 (uint32_t)mmap < mb_info->mmap_addr + 689 mb_info->mmap_length; 690 mmap = (mb_memory_map_t *)((uint32_t)mmap + 691 mmap->size + sizeof (mmap->size))) 692 ++num_entries; 693 694 num_entries_set = B_TRUE; 695 } 696 break; 697 case 2: 698 num_entries_set = B_TRUE; 699 num_entries = dboot_multiboot2_mmap_nentries(mb2_info, 700 mb2_mmap_tagp); 701 break; 702 default: 703 dboot_panic("Unknown multiboot version: %d\n", 704 multiboot_version); 705 break; 706 } 707 return (num_entries); 708 #else 709 return (MAXMAPS); 710 #endif 711 } 712 713 static uint32_t 714 dboot_loader_mmap_get_type(int index) 715 { 716 #if !defined(__xpv) 717 mb_memory_map_t *mp, *mpend; 718 int i; 719 720 switch (multiboot_version) { 721 case 1: 722 mp = (mb_memory_map_t *)mb_info->mmap_addr; 723 mpend = (mb_memory_map_t *) 724 (mb_info->mmap_addr + mb_info->mmap_length); 725 726 for (i = 0; mp < mpend && i != index; i++) 727 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + 728 sizeof (mp->size)); 729 if (mp >= mpend) { 730 dboot_panic("dboot_loader_mmap_get_type(): index " 731 "out of bounds: %d\n", index); 732 } 733 return (mp->type); 734 735 case 2: 736 return (dboot_multiboot2_mmap_get_type(mb2_info, 737 mb2_mmap_tagp, index)); 738 739 default: 740 dboot_panic("Unknown multiboot version: %d\n", 741 multiboot_version); 742 break; 743 } 744 return (0); 745 #else 746 return (map_buffer[index].type); 747 #endif 748 } 749 750 static uint64_t 751 dboot_loader_mmap_get_base(int index) 752 { 753 #if !defined(__xpv) 754 mb_memory_map_t *mp, *mpend; 755 int i; 756 757 switch (multiboot_version) { 758 case 1: 759 mp = (mb_memory_map_t *)mb_info->mmap_addr; 760 mpend = (mb_memory_map_t *) 761 (mb_info->mmap_addr + mb_info->mmap_length); 762 763 for (i = 0; mp < mpend && i != index; i++) 764 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + 765 sizeof (mp->size)); 766 if (mp >= mpend) { 767 dboot_panic("dboot_loader_mmap_get_base(): index " 768 "out of bounds: %d\n", index); 769 } 770 return (((uint64_t)mp->base_addr_high << 32) + 771 (uint64_t)mp->base_addr_low); 772 773 case 2: 774 return (dboot_multiboot2_mmap_get_base(mb2_info, 775 mb2_mmap_tagp, index)); 776 777 default: 778 dboot_panic("Unknown multiboot version: %d\n", 779 multiboot_version); 780 break; 781 } 782 return (0); 783 #else 784 return (((uint64_t)map_buffer[index].base_addr_high << 32) + 785 (uint64_t)map_buffer[index].base_addr_low); 786 #endif 787 } 788 789 static uint64_t 790 dboot_loader_mmap_get_length(int index) 791 { 792 #if !defined(__xpv) 793 mb_memory_map_t *mp, *mpend; 794 int i; 795 796 switch (multiboot_version) { 797 case 1: 798 mp = (mb_memory_map_t *)mb_info->mmap_addr; 799 mpend = (mb_memory_map_t *) 800 (mb_info->mmap_addr + mb_info->mmap_length); 801 802 for (i = 0; mp < mpend && i != index; i++) 803 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + 804 sizeof (mp->size)); 805 if (mp >= mpend) { 806 dboot_panic("dboot_loader_mmap_get_length(): index " 807 "out of bounds: %d\n", index); 808 } 809 return (((uint64_t)mp->length_high << 32) + 810 (uint64_t)mp->length_low); 811 812 case 2: 813 return (dboot_multiboot2_mmap_get_length(mb2_info, 814 mb2_mmap_tagp, index)); 815 816 default: 817 dboot_panic("Unknown multiboot version: %d\n", 818 multiboot_version); 819 break; 820 } 821 return (0); 822 #else 823 return (((uint64_t)map_buffer[index].length_high << 32) + 824 (uint64_t)map_buffer[index].length_low); 825 #endif 826 } 827 828 static void 829 build_pcimemlists(void) 830 { 831 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ 832 uint64_t start; 833 uint64_t end; 834 int i, num; 835 836 /* 837 * initialize 838 */ 839 pcimemlists[0].addr = pci_lo_limit; 840 pcimemlists[0].size = pci_hi_limit - pci_lo_limit; 841 pcimemlists_used = 1; 842 843 num = dboot_loader_mmap_entries(); 844 /* 845 * Fill in PCI memlists. 846 */ 847 for (i = 0; i < num; ++i) { 848 start = dboot_loader_mmap_get_base(i); 849 end = start + dboot_loader_mmap_get_length(i); 850 851 if (prom_debug) 852 dboot_printf("\ttype: %d %" PRIx64 "..%" 853 PRIx64 "\n", dboot_loader_mmap_get_type(i), 854 start, end); 855 856 /* 857 * page align start and end 858 */ 859 start = (start + page_offset) & ~page_offset; 860 end &= ~page_offset; 861 if (end <= start) 862 continue; 863 864 exclude_from_pci(start, end); 865 } 866 867 /* 868 * Finish off the pcimemlist 869 */ 870 if (prom_debug) { 871 for (i = 0; i < pcimemlists_used; ++i) { 872 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%" 873 PRIx64 "\n", pcimemlists[i].addr, 874 pcimemlists[i].addr + pcimemlists[i].size); 875 } 876 } 877 pcimemlists[0].next = 0; 878 pcimemlists[0].prev = 0; 879 for (i = 1; i < pcimemlists_used; ++i) { 880 pcimemlists[i].prev = 881 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1); 882 pcimemlists[i].next = 0; 883 pcimemlists[i - 1].next = 884 (native_ptr_t)(uintptr_t)(pcimemlists + i); 885 } 886 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists; 887 DBG(bi->bi_pcimem); 888 } 889 890 #if defined(__xpv) 891 /* 892 * Initialize memory allocator stuff from hypervisor-supplied start info. 893 */ 894 static void 895 init_mem_alloc(void) 896 { 897 int local; /* variables needed to find start region */ 898 paddr_t scratch_start; 899 xen_memory_map_t map; 900 901 DBG_MSG("Entered init_mem_alloc()\n"); 902 903 /* 904 * Free memory follows the stack. There's at least 512KB of scratch 905 * space, rounded up to at least 2Mb alignment. That should be enough 906 * for the page tables we'll need to build. The nucleus memory is 907 * allocated last and will be outside the addressible range. We'll 908 * switch to new page tables before we unpack the kernel 909 */ 910 scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); 911 DBG(scratch_start); 912 scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG); 913 DBG(scratch_end); 914 915 /* 916 * For paranoia, leave some space between hypervisor data and ours. 917 * Use 500 instead of 512. 918 */ 919 next_avail_addr = scratch_end - 500 * 1024; 920 DBG(next_avail_addr); 921 922 /* 923 * The domain builder gives us at most 1 module 924 */ 925 DBG(xen_info->mod_len); 926 if (xen_info->mod_len > 0) { 927 DBG(xen_info->mod_start); 928 modules[0].bm_addr = 929 (native_ptr_t)(uintptr_t)xen_info->mod_start; 930 modules[0].bm_size = xen_info->mod_len; 931 bi->bi_module_cnt = 1; 932 bi->bi_modules = (native_ptr_t)(uintptr_t)modules; 933 } else { 934 bi->bi_module_cnt = 0; 935 bi->bi_modules = (native_ptr_t)(uintptr_t)NULL; 936 } 937 DBG(bi->bi_module_cnt); 938 DBG(bi->bi_modules); 939 940 DBG(xen_info->mfn_list); 941 DBG(xen_info->nr_pages); 942 max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT; 943 DBG(max_mem); 944 945 /* 946 * Using pseudo-physical addresses, so only 1 memlist element 947 */ 948 memlists[0].addr = 0; 949 DBG(memlists[0].addr); 950 memlists[0].size = max_mem; 951 DBG(memlists[0].size); 952 memlists_used = 1; 953 DBG(memlists_used); 954 955 /* 956 * finish building physinstall list 957 */ 958 sort_physinstall(); 959 960 /* 961 * build bios reserved memlists 962 */ 963 build_rsvdmemlists(); 964 965 if (DOMAIN_IS_INITDOMAIN(xen_info)) { 966 /* 967 * build PCI Memory list 968 */ 969 map.nr_entries = MAXMAPS; 970 /*LINTED: constant in conditional context*/ 971 set_xen_guest_handle(map.buffer, map_buffer); 972 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0) 973 dboot_panic("getting XENMEM_machine_memory_map failed"); 974 build_pcimemlists(); 975 } 976 } 977 978 #else /* !__xpv */ 979 980 static void 981 dboot_multiboot1_xboot_consinfo(void) 982 { 983 bi->bi_framebuffer = NULL; 984 } 985 986 static void 987 dboot_multiboot2_xboot_consinfo(void) 988 { 989 multiboot_tag_framebuffer_t *fb; 990 fb = dboot_multiboot2_find_tag(mb2_info, 991 MULTIBOOT_TAG_TYPE_FRAMEBUFFER); 992 bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb; 993 } 994 995 static int 996 dboot_multiboot_modcount(void) 997 { 998 switch (multiboot_version) { 999 case 1: 1000 return (mb_info->mods_count); 1001 1002 case 2: 1003 return (dboot_multiboot2_modcount(mb2_info)); 1004 1005 default: 1006 dboot_panic("Unknown multiboot version: %d\n", 1007 multiboot_version); 1008 break; 1009 } 1010 return (0); 1011 } 1012 1013 static uint32_t 1014 dboot_multiboot_modstart(int index) 1015 { 1016 switch (multiboot_version) { 1017 case 1: 1018 return (((mb_module_t *)mb_info->mods_addr)[index].mod_start); 1019 1020 case 2: 1021 return (dboot_multiboot2_modstart(mb2_info, index)); 1022 1023 default: 1024 dboot_panic("Unknown multiboot version: %d\n", 1025 multiboot_version); 1026 break; 1027 } 1028 return (0); 1029 } 1030 1031 static uint32_t 1032 dboot_multiboot_modend(int index) 1033 { 1034 switch (multiboot_version) { 1035 case 1: 1036 return (((mb_module_t *)mb_info->mods_addr)[index].mod_end); 1037 1038 case 2: 1039 return (dboot_multiboot2_modend(mb2_info, index)); 1040 1041 default: 1042 dboot_panic("Unknown multiboot version: %d\n", 1043 multiboot_version); 1044 break; 1045 } 1046 return (0); 1047 } 1048 1049 static char * 1050 dboot_multiboot_modcmdline(int index) 1051 { 1052 switch (multiboot_version) { 1053 case 1: 1054 return ((char *)((mb_module_t *) 1055 mb_info->mods_addr)[index].mod_name); 1056 1057 case 2: 1058 return (dboot_multiboot2_modcmdline(mb2_info, index)); 1059 1060 default: 1061 dboot_panic("Unknown multiboot version: %d\n", 1062 multiboot_version); 1063 break; 1064 } 1065 return (0); 1066 } 1067 1068 /* 1069 * Find the environment module for console setup. 1070 * Since we need the console to print early boot messages, the console is set up 1071 * before anything else and therefore we need to pick up the environment module 1072 * early too. 1073 * 1074 * Note, we just will search for and if found, will pass the env 1075 * module to console setup, the proper module list processing will happen later. 1076 */ 1077 static void 1078 dboot_find_env(void) 1079 { 1080 int i, modcount; 1081 uint32_t mod_start, mod_end; 1082 char *cmdline; 1083 1084 modcount = dboot_multiboot_modcount(); 1085 1086 for (i = 0; i < modcount; ++i) { 1087 cmdline = dboot_multiboot_modcmdline(i); 1088 if (cmdline == NULL) 1089 continue; 1090 1091 if (strstr(cmdline, "type=environment") == NULL) 1092 continue; 1093 1094 mod_start = dboot_multiboot_modstart(i); 1095 mod_end = dboot_multiboot_modend(i); 1096 modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start; 1097 modules[0].bm_size = mod_end - mod_start; 1098 modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL; 1099 modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL; 1100 modules[0].bm_type = BMT_ENV; 1101 bi->bi_modules = (native_ptr_t)(uintptr_t)modules; 1102 bi->bi_module_cnt = 1; 1103 return; 1104 } 1105 } 1106 1107 static boolean_t 1108 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper) 1109 { 1110 boolean_t rv = B_FALSE; 1111 1112 switch (multiboot_version) { 1113 case 1: 1114 if (mb_info->flags & 0x01) { 1115 *lower = mb_info->mem_lower; 1116 *upper = mb_info->mem_upper; 1117 rv = B_TRUE; 1118 } 1119 break; 1120 1121 case 2: 1122 return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper)); 1123 1124 default: 1125 dboot_panic("Unknown multiboot version: %d\n", 1126 multiboot_version); 1127 break; 1128 } 1129 return (rv); 1130 } 1131 1132 static uint8_t 1133 dboot_a2h(char v) 1134 { 1135 if (v >= 'a') 1136 return (v - 'a' + 0xa); 1137 else if (v >= 'A') 1138 return (v - 'A' + 0xa); 1139 else if (v >= '0') 1140 return (v - '0'); 1141 else 1142 dboot_panic("bad ASCII hex character %c\n", v); 1143 1144 return (0); 1145 } 1146 1147 static void 1148 digest_a2h(const char *ascii, uint8_t *digest) 1149 { 1150 unsigned int i; 1151 1152 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) { 1153 digest[i] = dboot_a2h(ascii[i * 2]) << 4; 1154 digest[i] |= dboot_a2h(ascii[i * 2 + 1]); 1155 } 1156 } 1157 1158 /* 1159 * Generate a SHA-1 hash of the first len bytes of image, and compare it with 1160 * the ASCII-format hash found in the 40-byte buffer at ascii. If they 1161 * match, return 0, otherwise -1. This works only for images smaller than 1162 * 4 GB, which should not be a problem. 1163 */ 1164 static int 1165 check_image_hash(uint_t midx) 1166 { 1167 const char *ascii; 1168 const void *image; 1169 size_t len; 1170 SHA1_CTX ctx; 1171 uint8_t digest[SHA1_DIGEST_LENGTH]; 1172 uint8_t baseline[SHA1_DIGEST_LENGTH]; 1173 unsigned int i; 1174 1175 ascii = (const char *)(uintptr_t)modules[midx].bm_hash; 1176 image = (const void *)(uintptr_t)modules[midx].bm_addr; 1177 len = (size_t)modules[midx].bm_size; 1178 1179 digest_a2h(ascii, baseline); 1180 1181 SHA1Init(&ctx); 1182 SHA1Update(&ctx, image, len); 1183 SHA1Final(digest, &ctx); 1184 1185 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) { 1186 if (digest[i] != baseline[i]) 1187 return (-1); 1188 } 1189 1190 return (0); 1191 } 1192 1193 static const char * 1194 type_to_str(boot_module_type_t type) 1195 { 1196 switch (type) { 1197 case BMT_ROOTFS: 1198 return ("rootfs"); 1199 case BMT_FILE: 1200 return ("file"); 1201 case BMT_HASH: 1202 return ("hash"); 1203 case BMT_ENV: 1204 return ("environment"); 1205 default: 1206 return ("unknown"); 1207 } 1208 } 1209 1210 static void 1211 check_images(void) 1212 { 1213 uint_t i; 1214 char displayhash[SHA1_ASCII_LENGTH + 1]; 1215 1216 for (i = 0; i < modules_used; i++) { 1217 if (prom_debug) { 1218 dboot_printf("module #%d: name %s type %s " 1219 "addr %lx size %lx\n", 1220 i, (char *)(uintptr_t)modules[i].bm_name, 1221 type_to_str(modules[i].bm_type), 1222 (ulong_t)modules[i].bm_addr, 1223 (ulong_t)modules[i].bm_size); 1224 } 1225 1226 if (modules[i].bm_type == BMT_HASH || 1227 modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) { 1228 DBG_MSG("module has no hash; skipping check\n"); 1229 continue; 1230 } 1231 (void) memcpy(displayhash, 1232 (void *)(uintptr_t)modules[i].bm_hash, 1233 SHA1_ASCII_LENGTH); 1234 displayhash[SHA1_ASCII_LENGTH] = '\0'; 1235 if (prom_debug) { 1236 dboot_printf("checking expected hash [%s]: ", 1237 displayhash); 1238 } 1239 1240 if (check_image_hash(i) != 0) 1241 dboot_panic("hash mismatch!\n"); 1242 else 1243 DBG_MSG("OK\n"); 1244 } 1245 } 1246 1247 /* 1248 * Determine the module's starting address, size, name, and type, and fill the 1249 * boot_modules structure. This structure is used by the bop code, except for 1250 * hashes which are checked prior to transferring control to the kernel. 1251 */ 1252 static void 1253 process_module(int midx) 1254 { 1255 uint32_t mod_start = dboot_multiboot_modstart(midx); 1256 uint32_t mod_end = dboot_multiboot_modend(midx); 1257 char *cmdline = dboot_multiboot_modcmdline(midx); 1258 char *p, *q; 1259 1260 check_higher(mod_end); 1261 if (prom_debug) { 1262 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n", 1263 midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end); 1264 } 1265 1266 if (mod_start > mod_end) { 1267 dboot_panic("module #%d: module start address 0x%lx greater " 1268 "than end address 0x%lx", midx, 1269 (ulong_t)mod_start, (ulong_t)mod_end); 1270 } 1271 1272 /* 1273 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes 1274 * the address of the last valid byte in a module plus 1 as mod_end. 1275 * This is of course a bug; the multiboot specification simply states 1276 * that mod_start and mod_end "contain the start and end addresses of 1277 * the boot module itself" which is pretty obviously not what GRUB is 1278 * doing. However, fixing it requires that not only this code be 1279 * changed but also that other code consuming this value and values 1280 * derived from it be fixed, and that the kernel and GRUB must either 1281 * both have the bug or neither. While there are a lot of combinations 1282 * that will work, there are also some that won't, so for simplicity 1283 * we'll just cope with the bug. That means we won't actually hash the 1284 * byte at mod_end, and we will expect that mod_end for the hash file 1285 * itself is one greater than some multiple of 41 (40 bytes of ASCII 1286 * hash plus a newline for each module). We set bm_size to the true 1287 * correct number of bytes in each module, achieving exactly this. 1288 */ 1289 1290 modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start; 1291 modules[midx].bm_size = mod_end - mod_start; 1292 modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline; 1293 modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL; 1294 modules[midx].bm_type = BMT_FILE; 1295 1296 if (cmdline == NULL) { 1297 modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname; 1298 return; 1299 } 1300 1301 p = cmdline; 1302 modules[midx].bm_name = 1303 (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r"); 1304 1305 while (p != NULL) { 1306 q = strsep(&p, " \t\f\n\r"); 1307 if (strncmp(q, "name=", 5) == 0) { 1308 if (q[5] != '\0' && !isspace(q[5])) { 1309 modules[midx].bm_name = 1310 (native_ptr_t)(uintptr_t)(q + 5); 1311 } 1312 continue; 1313 } 1314 1315 if (strncmp(q, "type=", 5) == 0) { 1316 if (q[5] == '\0' || isspace(q[5])) 1317 continue; 1318 q += 5; 1319 if (strcmp(q, "rootfs") == 0) { 1320 modules[midx].bm_type = BMT_ROOTFS; 1321 } else if (strcmp(q, "hash") == 0) { 1322 modules[midx].bm_type = BMT_HASH; 1323 } else if (strcmp(q, "environment") == 0) { 1324 modules[midx].bm_type = BMT_ENV; 1325 } else if (strcmp(q, "file") != 0) { 1326 dboot_printf("\tmodule #%d: unknown module " 1327 "type '%s'; defaulting to 'file'", 1328 midx, q); 1329 } 1330 continue; 1331 } 1332 1333 if (strncmp(q, "hash=", 5) == 0) { 1334 if (q[5] != '\0' && !isspace(q[5])) { 1335 modules[midx].bm_hash = 1336 (native_ptr_t)(uintptr_t)(q + 5); 1337 } 1338 continue; 1339 } 1340 1341 dboot_printf("ignoring unknown option '%s'\n", q); 1342 } 1343 } 1344 1345 /* 1346 * Backward compatibility: if there are exactly one or two modules, both 1347 * of type 'file' and neither with an embedded hash value, we have been 1348 * given the legacy style modules. In this case we need to treat the first 1349 * module as a rootfs and the second as a hash referencing that module. 1350 * Otherwise, even if the configuration is invalid, we assume that the 1351 * operator knows what he's doing or at least isn't being bitten by this 1352 * interface change. 1353 */ 1354 static void 1355 fixup_modules(void) 1356 { 1357 if (modules_used == 0 || modules_used > 2) 1358 return; 1359 1360 if (modules[0].bm_type != BMT_FILE || 1361 modules_used > 1 && modules[1].bm_type != BMT_FILE) { 1362 return; 1363 } 1364 1365 if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL || 1366 modules_used > 1 && 1367 modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) { 1368 return; 1369 } 1370 1371 modules[0].bm_type = BMT_ROOTFS; 1372 if (modules_used > 1) { 1373 modules[1].bm_type = BMT_HASH; 1374 modules[1].bm_name = modules[0].bm_name; 1375 } 1376 } 1377 1378 /* 1379 * For modules that do not have assigned hashes but have a separate hash module, 1380 * find the assigned hash module and set the primary module's bm_hash to point 1381 * to the hash data from that module. We will then ignore modules of type 1382 * BMT_HASH from this point forward. 1383 */ 1384 static void 1385 assign_module_hashes(void) 1386 { 1387 uint_t i, j; 1388 1389 for (i = 0; i < modules_used; i++) { 1390 if (modules[i].bm_type == BMT_HASH || 1391 modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) { 1392 continue; 1393 } 1394 1395 for (j = 0; j < modules_used; j++) { 1396 if (modules[j].bm_type != BMT_HASH || 1397 strcmp((char *)(uintptr_t)modules[j].bm_name, 1398 (char *)(uintptr_t)modules[i].bm_name) != 0) { 1399 continue; 1400 } 1401 1402 if (modules[j].bm_size < SHA1_ASCII_LENGTH) { 1403 dboot_printf("Short hash module of length " 1404 "0x%lx bytes; ignoring\n", 1405 (ulong_t)modules[j].bm_size); 1406 } else { 1407 modules[i].bm_hash = modules[j].bm_addr; 1408 } 1409 break; 1410 } 1411 } 1412 } 1413 1414 /* 1415 * Walk through the module information finding the last used address. 1416 * The first available address will become the top level page table. 1417 */ 1418 static void 1419 dboot_process_modules(void) 1420 { 1421 int i, modcount; 1422 extern char _end[]; 1423 1424 DBG_MSG("\nFinding Modules\n"); 1425 modcount = dboot_multiboot_modcount(); 1426 if (modcount > MAX_BOOT_MODULES) { 1427 dboot_panic("Too many modules (%d) -- the maximum is %d.", 1428 modcount, MAX_BOOT_MODULES); 1429 } 1430 /* 1431 * search the modules to find the last used address 1432 * we'll build the module list while we're walking through here 1433 */ 1434 check_higher((paddr_t)(uintptr_t)&_end); 1435 for (i = 0; i < modcount; ++i) { 1436 process_module(i); 1437 modules_used++; 1438 } 1439 bi->bi_modules = (native_ptr_t)(uintptr_t)modules; 1440 DBG(bi->bi_modules); 1441 bi->bi_module_cnt = modcount; 1442 DBG(bi->bi_module_cnt); 1443 1444 fixup_modules(); 1445 assign_module_hashes(); 1446 check_images(); 1447 } 1448 1449 /* 1450 * We then build the phys_install memlist from the multiboot information. 1451 */ 1452 static void 1453 dboot_process_mmap(void) 1454 { 1455 uint64_t start; 1456 uint64_t end; 1457 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ 1458 uint32_t lower, upper; 1459 int i, mmap_entries; 1460 1461 /* 1462 * Walk through the memory map from multiboot and build our memlist 1463 * structures. Note these will have native format pointers. 1464 */ 1465 DBG_MSG("\nFinding Memory Map\n"); 1466 num_entries = 0; 1467 num_entries_set = B_FALSE; 1468 max_mem = 0; 1469 if ((mmap_entries = dboot_loader_mmap_entries()) > 0) { 1470 for (i = 0; i < mmap_entries; i++) { 1471 uint32_t type = dboot_loader_mmap_get_type(i); 1472 start = dboot_loader_mmap_get_base(i); 1473 end = start + dboot_loader_mmap_get_length(i); 1474 1475 if (prom_debug) 1476 dboot_printf("\ttype: %d %" PRIx64 "..%" 1477 PRIx64 "\n", type, start, end); 1478 1479 /* 1480 * page align start and end 1481 */ 1482 start = (start + page_offset) & ~page_offset; 1483 end &= ~page_offset; 1484 if (end <= start) 1485 continue; 1486 1487 /* 1488 * only type 1 is usable RAM 1489 */ 1490 switch (type) { 1491 case 1: 1492 if (end > max_mem) 1493 max_mem = end; 1494 memlists[memlists_used].addr = start; 1495 memlists[memlists_used].size = end - start; 1496 ++memlists_used; 1497 if (memlists_used > MAX_MEMLIST) 1498 dboot_panic("too many memlists"); 1499 break; 1500 case 2: 1501 rsvdmemlists[rsvdmemlists_used].addr = start; 1502 rsvdmemlists[rsvdmemlists_used].size = 1503 end - start; 1504 ++rsvdmemlists_used; 1505 if (rsvdmemlists_used > MAX_MEMLIST) 1506 dboot_panic("too many rsvdmemlists"); 1507 break; 1508 default: 1509 continue; 1510 } 1511 } 1512 build_pcimemlists(); 1513 } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) { 1514 DBG(lower); 1515 memlists[memlists_used].addr = 0; 1516 memlists[memlists_used].size = lower * 1024; 1517 ++memlists_used; 1518 DBG(upper); 1519 memlists[memlists_used].addr = 1024 * 1024; 1520 memlists[memlists_used].size = upper * 1024; 1521 ++memlists_used; 1522 1523 /* 1524 * Old platform - assume I/O space at the end of memory. 1525 */ 1526 pcimemlists[0].addr = (upper * 1024) + (1024 * 1024); 1527 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr; 1528 pcimemlists[0].next = 0; 1529 pcimemlists[0].prev = 0; 1530 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists; 1531 DBG(bi->bi_pcimem); 1532 } else { 1533 dboot_panic("No memory info from boot loader!!!"); 1534 } 1535 1536 /* 1537 * finish processing the physinstall list 1538 */ 1539 sort_physinstall(); 1540 1541 /* 1542 * build bios reserved mem lists 1543 */ 1544 build_rsvdmemlists(); 1545 } 1546 1547 /* 1548 * The highest address is used as the starting point for dboot's simple 1549 * memory allocator. 1550 * 1551 * Finding the highest address in case of Multiboot 1 protocol is 1552 * quite painful in the sense that some information provided by 1553 * the multiboot info structure points to BIOS data, and some to RAM. 1554 * 1555 * The module list was processed and checked already by dboot_process_modules(), 1556 * so we will check the command line string and the memory map. 1557 * 1558 * This list of to be checked items is based on our current knowledge of 1559 * allocations made by grub1 and will need to be reviewed if there 1560 * are updates about the information provided by Multiboot 1. 1561 * 1562 * In the case of the Multiboot 2, our life is much simpler, as the MB2 1563 * information tag list is one contiguous chunk of memory. 1564 */ 1565 static paddr_t 1566 dboot_multiboot1_highest_addr(void) 1567 { 1568 paddr_t addr = (paddr_t)(uintptr_t)NULL; 1569 char *cmdl = (char *)mb_info->cmdline; 1570 1571 if (mb_info->flags & MB_INFO_CMDLINE) 1572 addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1)); 1573 1574 if (mb_info->flags & MB_INFO_MEM_MAP) 1575 addr = MAX(addr, 1576 ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length))); 1577 return (addr); 1578 } 1579 1580 static void 1581 dboot_multiboot_highest_addr(void) 1582 { 1583 paddr_t addr; 1584 1585 switch (multiboot_version) { 1586 case 1: 1587 addr = dboot_multiboot1_highest_addr(); 1588 if (addr != (paddr_t)(uintptr_t)NULL) 1589 check_higher(addr); 1590 break; 1591 case 2: 1592 addr = dboot_multiboot2_highest_addr(mb2_info); 1593 if (addr != (paddr_t)(uintptr_t)NULL) 1594 check_higher(addr); 1595 break; 1596 default: 1597 dboot_panic("Unknown multiboot version: %d\n", 1598 multiboot_version); 1599 break; 1600 } 1601 } 1602 1603 /* 1604 * Walk the boot loader provided information and find the highest free address. 1605 */ 1606 static void 1607 init_mem_alloc(void) 1608 { 1609 DBG_MSG("Entered init_mem_alloc()\n"); 1610 dboot_process_modules(); 1611 dboot_process_mmap(); 1612 dboot_multiboot_highest_addr(); 1613 } 1614 1615 static int 1616 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2) 1617 { 1618 int i; 1619 1620 if (g1->time_low != g2->time_low) 1621 return (0); 1622 if (g1->time_mid != g2->time_mid) 1623 return (0); 1624 if (g1->time_hi_and_version != g2->time_hi_and_version) 1625 return (0); 1626 if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved) 1627 return (0); 1628 if (g1->clock_seq_low != g2->clock_seq_low) 1629 return (0); 1630 1631 for (i = 0; i < 6; i++) { 1632 if (g1->node_addr[i] != g2->node_addr[i]) 1633 return (0); 1634 } 1635 return (1); 1636 } 1637 1638 static void 1639 process_efi32(EFI_SYSTEM_TABLE32 *efi) 1640 { 1641 uint32_t entries; 1642 EFI_CONFIGURATION_TABLE32 *config; 1643 int i; 1644 1645 entries = efi->NumberOfTableEntries; 1646 config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t) 1647 efi->ConfigurationTable; 1648 1649 for (i = 0; i < entries; i++) { 1650 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) { 1651 bi->bi_smbios = (native_ptr_t)(uintptr_t) 1652 config[i].VendorTable; 1653 } 1654 if (bi->bi_smbios == NULL && 1655 dboot_same_guids(&config[i].VendorGuid, &smbios)) { 1656 bi->bi_smbios = (native_ptr_t)(uintptr_t) 1657 config[i].VendorTable; 1658 } 1659 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) { 1660 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1661 config[i].VendorTable; 1662 } 1663 if (bi->bi_acpi_rsdp == NULL && 1664 dboot_same_guids(&config[i].VendorGuid, &acpi1)) { 1665 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1666 config[i].VendorTable; 1667 } 1668 } 1669 } 1670 1671 static void 1672 process_efi64(EFI_SYSTEM_TABLE64 *efi) 1673 { 1674 uint64_t entries; 1675 EFI_CONFIGURATION_TABLE64 *config; 1676 int i; 1677 1678 entries = efi->NumberOfTableEntries; 1679 config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t) 1680 efi->ConfigurationTable; 1681 1682 for (i = 0; i < entries; i++) { 1683 if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) { 1684 bi->bi_smbios = (native_ptr_t)(uintptr_t) 1685 config[i].VendorTable; 1686 } 1687 if (bi->bi_smbios == NULL && 1688 dboot_same_guids(&config[i].VendorGuid, &smbios)) { 1689 bi->bi_smbios = (native_ptr_t)(uintptr_t) 1690 config[i].VendorTable; 1691 } 1692 /* Prefer acpi v2+ over v1. */ 1693 if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) { 1694 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1695 config[i].VendorTable; 1696 } 1697 if (bi->bi_acpi_rsdp == NULL && 1698 dboot_same_guids(&config[i].VendorGuid, &acpi1)) { 1699 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1700 config[i].VendorTable; 1701 } 1702 } 1703 } 1704 1705 static void 1706 dboot_multiboot_get_fwtables(void) 1707 { 1708 multiboot_tag_new_acpi_t *nacpitagp; 1709 multiboot_tag_old_acpi_t *oacpitagp; 1710 multiboot_tag_efi64_t *efi64tagp = NULL; 1711 multiboot_tag_efi32_t *efi32tagp = NULL; 1712 1713 /* no fw tables from multiboot 1 */ 1714 if (multiboot_version != 2) 1715 return; 1716 1717 efi64tagp = (multiboot_tag_efi64_t *) 1718 dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64); 1719 if (efi64tagp != NULL) { 1720 bi->bi_uefi_arch = XBI_UEFI_ARCH_64; 1721 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t) 1722 efi64tagp->mb_pointer; 1723 process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t) 1724 efi64tagp->mb_pointer); 1725 } else { 1726 efi32tagp = (multiboot_tag_efi32_t *) 1727 dboot_multiboot2_find_tag(mb2_info, 1728 MULTIBOOT_TAG_TYPE_EFI32); 1729 if (efi32tagp != NULL) { 1730 bi->bi_uefi_arch = XBI_UEFI_ARCH_32; 1731 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t) 1732 efi32tagp->mb_pointer; 1733 process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t) 1734 efi32tagp->mb_pointer); 1735 } 1736 } 1737 1738 /* 1739 * The ACPI RSDP can be found by scanning the BIOS memory areas or 1740 * from the EFI system table. The boot loader may pass in the address 1741 * it found the ACPI tables at. 1742 */ 1743 nacpitagp = (multiboot_tag_new_acpi_t *) 1744 dboot_multiboot2_find_tag(mb2_info, 1745 MULTIBOOT_TAG_TYPE_ACPI_NEW); 1746 oacpitagp = (multiboot_tag_old_acpi_t *) 1747 dboot_multiboot2_find_tag(mb2_info, 1748 MULTIBOOT_TAG_TYPE_ACPI_OLD); 1749 1750 if (nacpitagp != NULL) { 1751 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1752 &nacpitagp->mb_rsdp[0]; 1753 } else if (oacpitagp != NULL) { 1754 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) 1755 &oacpitagp->mb_rsdp[0]; 1756 } 1757 } 1758 1759 /* print out EFI version string with newline */ 1760 static void 1761 dboot_print_efi_version(uint32_t ver) 1762 { 1763 int rev; 1764 1765 dboot_printf("%d.", EFI_REV_MAJOR(ver)); 1766 1767 rev = EFI_REV_MINOR(ver); 1768 if ((rev % 10) != 0) { 1769 dboot_printf("%d.%d\n", rev / 10, rev % 10); 1770 } else { 1771 dboot_printf("%d\n", rev / 10); 1772 } 1773 } 1774 1775 static void 1776 print_efi32(EFI_SYSTEM_TABLE32 *efi) 1777 { 1778 uint16_t *data; 1779 EFI_CONFIGURATION_TABLE32 *conf; 1780 int i; 1781 1782 dboot_printf("EFI32 signature: %llx\n", 1783 (unsigned long long)efi->Hdr.Signature); 1784 dboot_printf("EFI system version: "); 1785 dboot_print_efi_version(efi->Hdr.Revision); 1786 dboot_printf("EFI system vendor: "); 1787 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor; 1788 for (i = 0; data[i] != 0; i++) 1789 dboot_printf("%c", (char)data[i]); 1790 dboot_printf("\nEFI firmware revision: "); 1791 dboot_print_efi_version(efi->FirmwareRevision); 1792 dboot_printf("EFI system table number of entries: %d\n", 1793 efi->NumberOfTableEntries); 1794 conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t) 1795 efi->ConfigurationTable; 1796 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) { 1797 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i, 1798 conf[i].VendorGuid.time_low, 1799 conf[i].VendorGuid.time_mid, 1800 conf[i].VendorGuid.time_hi_and_version, 1801 conf[i].VendorGuid.clock_seq_hi_and_reserved, 1802 conf[i].VendorGuid.clock_seq_low); 1803 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", 1804 conf[i].VendorGuid.node_addr[0], 1805 conf[i].VendorGuid.node_addr[1], 1806 conf[i].VendorGuid.node_addr[2], 1807 conf[i].VendorGuid.node_addr[3], 1808 conf[i].VendorGuid.node_addr[4], 1809 conf[i].VendorGuid.node_addr[5]); 1810 } 1811 } 1812 1813 static void 1814 print_efi64(EFI_SYSTEM_TABLE64 *efi) 1815 { 1816 uint16_t *data; 1817 EFI_CONFIGURATION_TABLE64 *conf; 1818 int i; 1819 1820 dboot_printf("EFI64 signature: %llx\n", 1821 (unsigned long long)efi->Hdr.Signature); 1822 dboot_printf("EFI system version: "); 1823 dboot_print_efi_version(efi->Hdr.Revision); 1824 dboot_printf("EFI system vendor: "); 1825 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor; 1826 for (i = 0; data[i] != 0; i++) 1827 dboot_printf("%c", (char)data[i]); 1828 dboot_printf("\nEFI firmware revision: "); 1829 dboot_print_efi_version(efi->FirmwareRevision); 1830 dboot_printf("EFI system table number of entries: %lld\n", 1831 efi->NumberOfTableEntries); 1832 conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t) 1833 efi->ConfigurationTable; 1834 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) { 1835 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i, 1836 conf[i].VendorGuid.time_low, 1837 conf[i].VendorGuid.time_mid, 1838 conf[i].VendorGuid.time_hi_and_version, 1839 conf[i].VendorGuid.clock_seq_hi_and_reserved, 1840 conf[i].VendorGuid.clock_seq_low); 1841 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", 1842 conf[i].VendorGuid.node_addr[0], 1843 conf[i].VendorGuid.node_addr[1], 1844 conf[i].VendorGuid.node_addr[2], 1845 conf[i].VendorGuid.node_addr[3], 1846 conf[i].VendorGuid.node_addr[4], 1847 conf[i].VendorGuid.node_addr[5]); 1848 } 1849 } 1850 #endif /* !__xpv */ 1851 1852 /* 1853 * Simple memory allocator, allocates aligned physical memory. 1854 * Note that startup_kernel() only allocates memory, never frees. 1855 * Memory usage just grows in an upward direction. 1856 */ 1857 static void * 1858 do_mem_alloc(uint32_t size, uint32_t align) 1859 { 1860 uint_t i; 1861 uint64_t best; 1862 uint64_t start; 1863 uint64_t end; 1864 1865 /* 1866 * make sure size is a multiple of pagesize 1867 */ 1868 size = RNDUP(size, MMU_PAGESIZE); 1869 next_avail_addr = RNDUP(next_avail_addr, align); 1870 1871 /* 1872 * XXPV fixme joe 1873 * 1874 * a really large bootarchive that causes you to run out of memory 1875 * may cause this to blow up 1876 */ 1877 /* LINTED E_UNEXPECTED_UINT_PROMOTION */ 1878 best = (uint64_t)-size; 1879 for (i = 0; i < memlists_used; ++i) { 1880 start = memlists[i].addr; 1881 #if defined(__xpv) 1882 start += mfn_base; 1883 #endif 1884 end = start + memlists[i].size; 1885 1886 /* 1887 * did we find the desired address? 1888 */ 1889 if (start <= next_avail_addr && next_avail_addr + size <= end) { 1890 best = next_avail_addr; 1891 goto done; 1892 } 1893 1894 /* 1895 * if not is this address the best so far? 1896 */ 1897 if (start > next_avail_addr && start < best && 1898 RNDUP(start, align) + size <= end) 1899 best = RNDUP(start, align); 1900 } 1901 1902 /* 1903 * We didn't find exactly the address we wanted, due to going off the 1904 * end of a memory region. Return the best found memory address. 1905 */ 1906 done: 1907 next_avail_addr = best + size; 1908 #if defined(__xpv) 1909 if (next_avail_addr > scratch_end) 1910 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: " 1911 "0x%lx", (ulong_t)next_avail_addr, 1912 (ulong_t)scratch_end); 1913 #endif 1914 (void) memset((void *)(uintptr_t)best, 0, size); 1915 return ((void *)(uintptr_t)best); 1916 } 1917 1918 void * 1919 mem_alloc(uint32_t size) 1920 { 1921 return (do_mem_alloc(size, MMU_PAGESIZE)); 1922 } 1923 1924 1925 /* 1926 * Build page tables to map all of memory used so far as well as the kernel. 1927 */ 1928 static void 1929 build_page_tables(void) 1930 { 1931 uint32_t psize; 1932 uint32_t level; 1933 uint32_t off; 1934 uint64_t start; 1935 #if !defined(__xpv) 1936 uint32_t i; 1937 uint64_t end; 1938 #endif /* __xpv */ 1939 1940 /* 1941 * If we're on metal, we need to create the top level pagetable. 1942 */ 1943 #if defined(__xpv) 1944 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base; 1945 #else /* __xpv */ 1946 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 1947 #endif /* __xpv */ 1948 DBG((uintptr_t)top_page_table); 1949 1950 /* 1951 * Determine if we'll use large mappings for kernel, then map it. 1952 */ 1953 if (largepage_support) { 1954 psize = lpagesize; 1955 level = 1; 1956 } else { 1957 psize = MMU_PAGESIZE; 1958 level = 0; 1959 } 1960 1961 DBG_MSG("Mapping kernel\n"); 1962 DBG(ktext_phys); 1963 DBG(target_kernel_text); 1964 DBG(ksize); 1965 DBG(psize); 1966 for (off = 0; off < ksize; off += psize) 1967 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level); 1968 1969 /* 1970 * The kernel will need a 1 page window to work with page tables 1971 */ 1972 bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 1973 DBG(bi->bi_pt_window); 1974 bi->bi_pte_to_pt_window = 1975 (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0); 1976 DBG(bi->bi_pte_to_pt_window); 1977 1978 #if defined(__xpv) 1979 if (!DOMAIN_IS_INITDOMAIN(xen_info)) { 1980 /* If this is a domU we're done. */ 1981 DBG_MSG("\nPage tables constructed\n"); 1982 return; 1983 } 1984 #endif /* __xpv */ 1985 1986 /* 1987 * We need 1:1 mappings for the lower 1M of memory to access 1988 * BIOS tables used by a couple of drivers during boot. 1989 * 1990 * The following code works because our simple memory allocator 1991 * only grows usage in an upwards direction. 1992 * 1993 * Note that by this point in boot some mappings for low memory 1994 * may already exist because we've already accessed device in low 1995 * memory. (Specifically the video frame buffer and keyboard 1996 * status ports.) If we're booting on raw hardware then GRUB 1997 * created these mappings for us. If we're booting under a 1998 * hypervisor then we went ahead and remapped these devices into 1999 * memory allocated within dboot itself. 2000 */ 2001 if (map_debug) 2002 dboot_printf("1:1 map pa=0..1Meg\n"); 2003 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) { 2004 #if defined(__xpv) 2005 map_ma_at_va(start, start, 0); 2006 #else /* __xpv */ 2007 map_pa_at_va(start, start, 0); 2008 #endif /* __xpv */ 2009 } 2010 2011 #if !defined(__xpv) 2012 2013 for (i = 0; i < memlists_used; ++i) { 2014 start = memlists[i].addr; 2015 end = start + memlists[i].size; 2016 2017 if (map_debug) 2018 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", 2019 start, end); 2020 while (start < end && start < next_avail_addr) { 2021 map_pa_at_va(start, start, 0); 2022 start += MMU_PAGESIZE; 2023 } 2024 if (start >= next_avail_addr) 2025 break; 2026 } 2027 2028 /* 2029 * Map framebuffer memory as PT_NOCACHE as this is memory from a 2030 * device and therefore must not be cached. 2031 */ 2032 if (bi->bi_framebuffer != NULL) { 2033 multiboot_tag_framebuffer_t *fb; 2034 fb = (multiboot_tag_framebuffer_t *)(uintptr_t) 2035 bi->bi_framebuffer; 2036 2037 start = fb->framebuffer_common.framebuffer_addr; 2038 end = start + fb->framebuffer_common.framebuffer_height * 2039 fb->framebuffer_common.framebuffer_pitch; 2040 2041 if (map_debug) 2042 dboot_printf("FB 1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", 2043 start, end); 2044 pte_bits |= PT_NOCACHE; 2045 while (start < end) { 2046 map_pa_at_va(start, start, 0); 2047 start += MMU_PAGESIZE; 2048 } 2049 pte_bits &= ~PT_NOCACHE; 2050 } 2051 #endif /* !__xpv */ 2052 2053 DBG_MSG("\nPage tables constructed\n"); 2054 } 2055 2056 #define NO_MULTIBOOT \ 2057 "multiboot is no longer used to boot the Solaris Operating System.\n\ 2058 The grub entry should be changed to:\n\ 2059 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\ 2060 module$ /platform/i86pc/$ISADIR/boot_archive\n\ 2061 See http://illumos.org/msg/SUNOS-8000-AK for details.\n" 2062 2063 static void 2064 dboot_init_xboot_consinfo(void) 2065 { 2066 uintptr_t addr; 2067 /* 2068 * boot info must be 16 byte aligned for 64 bit kernel ABI 2069 */ 2070 addr = (uintptr_t)boot_info; 2071 addr = (addr + 0xf) & ~0xf; 2072 bi = (struct xboot_info *)addr; 2073 2074 #if !defined(__xpv) 2075 switch (multiboot_version) { 2076 case 1: 2077 dboot_multiboot1_xboot_consinfo(); 2078 break; 2079 case 2: 2080 dboot_multiboot2_xboot_consinfo(); 2081 break; 2082 default: 2083 dboot_panic("Unknown multiboot version: %d\n", 2084 multiboot_version); 2085 break; 2086 } 2087 /* 2088 * Lookup environment module for the console. Complete module list 2089 * will be built after console setup. 2090 */ 2091 dboot_find_env(); 2092 #endif 2093 } 2094 2095 /* 2096 * Set up basic data from the boot loader. 2097 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support 2098 * 32-bit dboot code setup used to set up and start 64-bit kernel. 2099 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and 2100 * start 64-bit illumos kernel. 2101 */ 2102 static void 2103 dboot_loader_init(void) 2104 { 2105 #if !defined(__xpv) 2106 mb_info = NULL; 2107 mb2_info = NULL; 2108 2109 switch (mb_magic) { 2110 case MB_BOOTLOADER_MAGIC: 2111 multiboot_version = 1; 2112 mb_info = (multiboot_info_t *)(uintptr_t)mb_addr; 2113 #if defined(_BOOT_TARGET_amd64) 2114 load_addr = mb_header.load_addr; 2115 #endif 2116 break; 2117 2118 case MULTIBOOT2_BOOTLOADER_MAGIC: 2119 multiboot_version = 2; 2120 mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr; 2121 mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info); 2122 #if defined(_BOOT_TARGET_amd64) 2123 load_addr = mb2_load_addr; 2124 #endif 2125 break; 2126 2127 default: 2128 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic); 2129 break; 2130 } 2131 #endif /* !defined(__xpv) */ 2132 } 2133 2134 /* Extract the kernel command line from [multi]boot information. */ 2135 static char * 2136 dboot_loader_cmdline(void) 2137 { 2138 char *line = NULL; 2139 2140 #if defined(__xpv) 2141 line = (char *)xen_info->cmd_line; 2142 #else /* __xpv */ 2143 2144 switch (multiboot_version) { 2145 case 1: 2146 if (mb_info->flags & MB_INFO_CMDLINE) 2147 line = (char *)mb_info->cmdline; 2148 break; 2149 2150 case 2: 2151 line = dboot_multiboot2_cmdline(mb2_info); 2152 break; 2153 2154 default: 2155 dboot_panic("Unknown multiboot version: %d\n", 2156 multiboot_version); 2157 break; 2158 } 2159 2160 #endif /* __xpv */ 2161 2162 /* 2163 * Make sure we have valid pointer so the string operations 2164 * will not crash us. 2165 */ 2166 if (line == NULL) 2167 line = ""; 2168 2169 return (line); 2170 } 2171 2172 static char * 2173 dboot_loader_name(void) 2174 { 2175 #if defined(__xpv) 2176 return (NULL); 2177 #else /* __xpv */ 2178 multiboot_tag_string_t *tag; 2179 2180 switch (multiboot_version) { 2181 case 1: 2182 return ((char *)mb_info->boot_loader_name); 2183 2184 case 2: 2185 tag = dboot_multiboot2_find_tag(mb2_info, 2186 MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME); 2187 return (tag->mb_string); 2188 default: 2189 dboot_panic("Unknown multiboot version: %d\n", 2190 multiboot_version); 2191 break; 2192 } 2193 2194 return (NULL); 2195 #endif /* __xpv */ 2196 } 2197 2198 /* 2199 * startup_kernel has a pretty simple job. It builds pagetables which reflect 2200 * 1:1 mappings for all memory in use. It then also adds mappings for 2201 * the kernel nucleus at virtual address of target_kernel_text using large page 2202 * mappings. The page table pages are also accessible at 1:1 mapped 2203 * virtual addresses. 2204 */ 2205 /*ARGSUSED*/ 2206 void 2207 startup_kernel(void) 2208 { 2209 char *cmdline; 2210 char *bootloader; 2211 #if defined(__xpv) 2212 physdev_set_iopl_t set_iopl; 2213 #endif /* __xpv */ 2214 2215 if (dboot_debug == 1) 2216 bcons_init(NULL); /* Set very early console to ttya. */ 2217 dboot_loader_init(); 2218 /* 2219 * At this point we are executing in a 32 bit real mode. 2220 */ 2221 2222 bootloader = dboot_loader_name(); 2223 cmdline = dboot_loader_cmdline(); 2224 2225 #if defined(__xpv) 2226 /* 2227 * For dom0, before we initialize the console subsystem we'll 2228 * need to enable io operations, so set I/O priveldge level to 1. 2229 */ 2230 if (DOMAIN_IS_INITDOMAIN(xen_info)) { 2231 set_iopl.iopl = 1; 2232 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 2233 } 2234 #endif /* __xpv */ 2235 2236 dboot_init_xboot_consinfo(); 2237 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; 2238 bcons_init(bi); /* Now we can set the real console. */ 2239 2240 prom_debug = (find_boot_prop("prom_debug") != NULL); 2241 map_debug = (find_boot_prop("map_debug") != NULL); 2242 2243 #if !defined(__xpv) 2244 dboot_multiboot_get_fwtables(); 2245 #endif 2246 DBG_MSG("\n\nillumos prekernel set: "); 2247 DBG_MSG(cmdline); 2248 DBG_MSG("\n"); 2249 2250 if (bootloader != NULL && prom_debug) { 2251 dboot_printf("Kernel loaded by: %s\n", bootloader); 2252 #if !defined(__xpv) 2253 dboot_printf("Using multiboot %d boot protocol.\n", 2254 multiboot_version); 2255 #endif 2256 } 2257 2258 if (strstr(cmdline, "multiboot") != NULL) { 2259 dboot_panic(NO_MULTIBOOT); 2260 } 2261 2262 DBG((uintptr_t)bi); 2263 #if !defined(__xpv) 2264 DBG((uintptr_t)mb_info); 2265 DBG((uintptr_t)mb2_info); 2266 if (mb2_info != NULL) 2267 DBG(mb2_info->mbi_total_size); 2268 DBG(bi->bi_acpi_rsdp); 2269 DBG(bi->bi_smbios); 2270 DBG(bi->bi_uefi_arch); 2271 DBG(bi->bi_uefi_systab); 2272 2273 if (bi->bi_uefi_systab && prom_debug) { 2274 if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) { 2275 print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t) 2276 bi->bi_uefi_systab); 2277 } else { 2278 print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t) 2279 bi->bi_uefi_systab); 2280 } 2281 } 2282 #endif 2283 2284 /* 2285 * Need correct target_kernel_text value 2286 */ 2287 #if defined(_BOOT_TARGET_amd64) 2288 target_kernel_text = KERNEL_TEXT_amd64; 2289 #elif defined(__xpv) 2290 target_kernel_text = KERNEL_TEXT_i386_xpv; 2291 #else 2292 target_kernel_text = KERNEL_TEXT_i386; 2293 #endif 2294 DBG(target_kernel_text); 2295 2296 #if defined(__xpv) 2297 2298 /* 2299 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled 2300 */ 2301 2302 #if defined(_BOOT_TARGET_amd64) 2303 /* 2304 * 64-bit hypervisor. 2305 */ 2306 amd64_support = 1; 2307 pae_support = 1; 2308 2309 #else /* _BOOT_TARGET_amd64 */ 2310 2311 /* 2312 * See if we are running on a PAE Hypervisor 2313 */ 2314 { 2315 xen_capabilities_info_t caps; 2316 2317 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0) 2318 dboot_panic("HYPERVISOR_xen_version(caps) failed"); 2319 caps[sizeof (caps) - 1] = 0; 2320 if (prom_debug) 2321 dboot_printf("xen capabilities %s\n", caps); 2322 if (strstr(caps, "x86_32p") != NULL) 2323 pae_support = 1; 2324 } 2325 2326 #endif /* _BOOT_TARGET_amd64 */ 2327 { 2328 xen_platform_parameters_t p; 2329 2330 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0) 2331 dboot_panic("HYPERVISOR_xen_version(parms) failed"); 2332 DBG(p.virt_start); 2333 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start); 2334 } 2335 2336 /* 2337 * The hypervisor loads stuff starting at 1Gig 2338 */ 2339 mfn_base = ONE_GIG; 2340 DBG(mfn_base); 2341 2342 /* 2343 * enable writable page table mode for the hypervisor 2344 */ 2345 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 2346 VMASST_TYPE_writable_pagetables) < 0) 2347 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed"); 2348 2349 /* 2350 * check for NX support 2351 */ 2352 if (pae_support) { 2353 uint32_t eax = 0x80000000; 2354 uint32_t edx = get_cpuid_edx(&eax); 2355 2356 if (eax >= 0x80000001) { 2357 eax = 0x80000001; 2358 edx = get_cpuid_edx(&eax); 2359 if (edx & CPUID_AMD_EDX_NX) 2360 NX_support = 1; 2361 } 2362 } 2363 2364 #if !defined(_BOOT_TARGET_amd64) 2365 2366 /* 2367 * The 32-bit hypervisor uses segmentation to protect itself from 2368 * guests. This means when a guest attempts to install a flat 4GB 2369 * code or data descriptor the 32-bit hypervisor will protect itself 2370 * by silently shrinking the segment such that if the guest attempts 2371 * any access where the hypervisor lives a #gp fault is generated. 2372 * The problem is that some applications expect a full 4GB flat 2373 * segment for their current thread pointer and will use negative 2374 * offset segment wrap around to access data. TLS support in linux 2375 * brand is one example of this. 2376 * 2377 * The 32-bit hypervisor can catch the #gp fault in these cases 2378 * and emulate the access without passing the #gp fault to the guest 2379 * but only if VMASST_TYPE_4gb_segments is explicitly turned on. 2380 * Seems like this should have been the default. 2381 * Either way, we want the hypervisor -- and not Solaris -- to deal 2382 * to deal with emulating these accesses. 2383 */ 2384 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 2385 VMASST_TYPE_4gb_segments) < 0) 2386 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed"); 2387 #endif /* !_BOOT_TARGET_amd64 */ 2388 2389 #else /* __xpv */ 2390 2391 /* 2392 * use cpuid to enable MMU features 2393 */ 2394 if (have_cpuid()) { 2395 uint32_t eax, edx; 2396 2397 eax = 1; 2398 edx = get_cpuid_edx(&eax); 2399 if (edx & CPUID_INTC_EDX_PSE) 2400 largepage_support = 1; 2401 if (edx & CPUID_INTC_EDX_PGE) 2402 pge_support = 1; 2403 if (edx & CPUID_INTC_EDX_PAE) 2404 pae_support = 1; 2405 2406 eax = 0x80000000; 2407 edx = get_cpuid_edx(&eax); 2408 if (eax >= 0x80000001) { 2409 eax = 0x80000001; 2410 edx = get_cpuid_edx(&eax); 2411 if (edx & CPUID_AMD_EDX_LM) 2412 amd64_support = 1; 2413 if (edx & CPUID_AMD_EDX_NX) 2414 NX_support = 1; 2415 } 2416 } else { 2417 dboot_printf("cpuid not supported\n"); 2418 } 2419 #endif /* __xpv */ 2420 2421 2422 #if defined(_BOOT_TARGET_amd64) 2423 if (amd64_support == 0) 2424 dboot_panic("long mode not supported, rebooting"); 2425 else if (pae_support == 0) 2426 dboot_panic("long mode, but no PAE; rebooting"); 2427 #else 2428 /* 2429 * Allow the command line to over-ride use of PAE for 32 bit. 2430 */ 2431 if (strstr(cmdline, "disablePAE=true") != NULL) { 2432 pae_support = 0; 2433 NX_support = 0; 2434 amd64_support = 0; 2435 } 2436 #endif 2437 2438 /* 2439 * initialize the simple memory allocator 2440 */ 2441 init_mem_alloc(); 2442 2443 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64) 2444 /* 2445 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory 2446 */ 2447 if (max_mem < FOUR_GIG && NX_support == 0) 2448 pae_support = 0; 2449 #endif 2450 2451 /* 2452 * configure mmu information 2453 */ 2454 if (pae_support) { 2455 shift_amt = shift_amt_pae; 2456 ptes_per_table = 512; 2457 pte_size = 8; 2458 lpagesize = TWO_MEG; 2459 #if defined(_BOOT_TARGET_amd64) 2460 top_level = 3; 2461 #else 2462 top_level = 2; 2463 #endif 2464 } else { 2465 pae_support = 0; 2466 NX_support = 0; 2467 shift_amt = shift_amt_nopae; 2468 ptes_per_table = 1024; 2469 pte_size = 4; 2470 lpagesize = FOUR_MEG; 2471 top_level = 1; 2472 } 2473 2474 DBG(pge_support); 2475 DBG(NX_support); 2476 DBG(largepage_support); 2477 DBG(amd64_support); 2478 DBG(top_level); 2479 DBG(pte_size); 2480 DBG(ptes_per_table); 2481 DBG(lpagesize); 2482 2483 #if defined(__xpv) 2484 ktext_phys = ONE_GIG; /* from UNIX Mapfile */ 2485 #else 2486 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */ 2487 #endif 2488 2489 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64) 2490 /* 2491 * For grub, copy kernel bits from the ELF64 file to final place. 2492 */ 2493 DBG_MSG("\nAllocating nucleus pages.\n"); 2494 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); 2495 2496 if (ktext_phys == 0) 2497 dboot_panic("failed to allocate aligned kernel memory"); 2498 DBG(load_addr); 2499 if (dboot_elfload64(load_addr) != 0) 2500 dboot_panic("failed to parse kernel ELF image, rebooting"); 2501 #endif 2502 2503 DBG(ktext_phys); 2504 2505 /* 2506 * Allocate page tables. 2507 */ 2508 build_page_tables(); 2509 2510 /* 2511 * return to assembly code to switch to running kernel 2512 */ 2513 entry_addr_low = (uint32_t)target_kernel_text; 2514 DBG(entry_addr_low); 2515 bi->bi_use_largepage = largepage_support; 2516 bi->bi_use_pae = pae_support; 2517 bi->bi_use_pge = pge_support; 2518 bi->bi_use_nx = NX_support; 2519 2520 #if defined(__xpv) 2521 2522 bi->bi_next_paddr = next_avail_addr - mfn_base; 2523 DBG(bi->bi_next_paddr); 2524 bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; 2525 DBG(bi->bi_next_vaddr); 2526 2527 /* 2528 * unmap unused pages in start area to make them available for DMA 2529 */ 2530 while (next_avail_addr < scratch_end) { 2531 (void) HYPERVISOR_update_va_mapping(next_avail_addr, 2532 0, UVMF_INVLPG | UVMF_LOCAL); 2533 next_avail_addr += MMU_PAGESIZE; 2534 } 2535 2536 bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info; 2537 DBG((uintptr_t)HYPERVISOR_shared_info); 2538 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info; 2539 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base; 2540 2541 #else /* __xpv */ 2542 2543 bi->bi_next_paddr = next_avail_addr; 2544 DBG(bi->bi_next_paddr); 2545 bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; 2546 DBG(bi->bi_next_vaddr); 2547 bi->bi_mb_version = multiboot_version; 2548 2549 switch (multiboot_version) { 2550 case 1: 2551 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info; 2552 break; 2553 case 2: 2554 bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info; 2555 break; 2556 default: 2557 dboot_panic("Unknown multiboot version: %d\n", 2558 multiboot_version); 2559 break; 2560 } 2561 bi->bi_top_page_table = (uintptr_t)top_page_table; 2562 2563 #endif /* __xpv */ 2564 2565 bi->bi_kseg_size = FOUR_MEG; 2566 DBG(bi->bi_kseg_size); 2567 2568 #ifndef __xpv 2569 if (map_debug) 2570 dump_tables(); 2571 #endif 2572 2573 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n"); 2574 } 2575