/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2013 Joyent, Inc.  All rights reserved.
 */

#include <sys/types.h>
#include <sys/machparam.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/mach_mmu.h>
#include <sys/multiboot.h>
#include <sys/multiboot2.h>
#include <sys/multiboot2_impl.h>
#include <sys/sysmacros.h>
#include <sys/sha1.h>
#include <util/string.h>
#include <util/strtolctype.h>
#include <sys/efi.h>

#if defined(__xpv)

#include <sys/hypervisor.h>
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;

#else /* !__xpv */

extern multiboot_header_t mb_header;
extern uint32_t mb2_load_addr;
extern int have_cpuid(void);

#endif /* !__xpv */

#include <sys/inttypes.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
#include <sys/boot_console.h>

#include "dboot_asm.h"
#include "dboot_printf.h"
#include "dboot_xboot.h"
#include "dboot_elfload.h"

#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)

/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or a XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running before paging is enabled, all such memory is
 * accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
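
/*
 * Illustrative sketch, not part of the original source: a leaf PTE is the
 * page-aligned physical address OR-ed with pte_bits, and an intermediate
 * PTP entry is the next table's address OR-ed with ptp_bits. The helper
 * name below is hypothetical and the block is compiled out.
 */
#if 0
static x86pte_t
example_make_pte(paddr_t pa)
{
	/* pa is page aligned, so the low bits are free to carry the flags */
	return ((x86pte_t)pa | pte_bits);
}
#endif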

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
uint32_t mb_magic;		/* magic from boot loader */
uint32_t mb_addr;		/* multiboot info package from loader */
int multiboot_version;
multiboot_info_t *mb_info;
multiboot2_info_header_t *mb2_info;
multiboot_tag_mmap_t *mb2_mmap_tagp;
int num_entries;		/* mmap entry count */
boolean_t num_entries_set;	/* is mmap entry count set */
uintptr_t load_addr;

/* cannot be automatic variables because of alignment */
static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
static efi_guid_t smbios = SMBIOS_TABLE_GUID;
static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;

#ifdef __xpv
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;

/*
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Debugging macros
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

static char noname[2] = "-";
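
/*
 * Illustrative sketch, not part of the original source: both mmap formats
 * above split 64-bit base and length values into low/high 32-bit words,
 * which the accessors later in this file reassemble as shown here. The
 * helper name is hypothetical and the block is compiled out.
 */
#if 0
static uint64_t
example_mmap_base(const mmap_t *mp)
{
	return (((uint64_t)mp->base_addr_high << 32) |
	    (uint64_t)mp->base_addr_low);
}
#endif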

/*
 * Either hypervisor-specific or grub-specific code builds the initial
 * memlists. This code does the sort/merge/link for final use.
 */
static void
sort_physinstall(void)
{
	int i;
#if !defined(__xpv)
	int j;
	struct boot_memlist tmp;

	/*
	 * Now sort the memlists, in case they weren't in order.
	 * Yeah, this is a bubble sort; small, simple and easy to get right.
	 */
	DBG_MSG("Sorting phys-installed list\n");
	for (j = memlists_used - 1; j > 0; --j) {
		for (i = 0; i < j; ++i) {
			if (memlists[i].addr < memlists[i + 1].addr)
				continue;
			tmp = memlists[i];
			memlists[i] = memlists[i + 1];
			memlists[i + 1] = tmp;
		}
	}

	/*
	 * Merge any memlists that don't have holes between them.
	 */
	for (i = 0; i < memlists_used - 1; ++i) {
		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
			continue;

		if (prom_debug)
			dboot_printf(
			    "merging mem segs %" PRIx64 "...%" PRIx64
			    " w/ %" PRIx64 "...%" PRIx64 "\n",
			    memlists[i].addr,
			    memlists[i].addr + memlists[i].size,
			    memlists[i + 1].addr,
			    memlists[i + 1].addr + memlists[i + 1].size);

		memlists[i].size += memlists[i + 1].size;
		for (j = i + 1; j < memlists_used - 1; ++j)
			memlists[j] = memlists[j + 1];
		--memlists_used;
		DBG(memlists_used);
		--i;	/* after merging we need to reexamine, so do this */
	}
#endif	/* __xpv */

	if (prom_debug) {
		dboot_printf("\nFinal memlists:\n");
		for (i = 0; i < memlists_used; ++i) {
			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
		}
	}

	/*
	 * link together the memlists with native size pointers
	 */
	memlists[0].next = 0;
	memlists[0].prev = 0;
	for (i = 1; i < memlists_used; ++i) {
		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
		memlists[i].next = 0;
		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
	}
	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
	DBG(bi->bi_phys_install);
}

/*
 * build bios reserved memlists
 */
static void
build_rsvdmemlists(void)
{
	int i;

	rsvdmemlists[0].next = 0;
	rsvdmemlists[0].prev = 0;
	for (i = 1; i < rsvdmemlists_used; ++i) {
		rsvdmemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
		rsvdmemlists[i].next = 0;
		rsvdmemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
	}
	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
	DBG(bi->bi_rsvdmem);
}
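
/*
 * Illustrative sketch, not part of the original source: the next/prev
 * fields are native-sized pointers stored as integers, so a consumer of
 * bi_phys_install can walk the list as below. The helper name is
 * hypothetical and the block is compiled out.
 */
#if 0
static void
example_walk_phys_install(struct xboot_info *xbp)
{
	struct boot_memlist *ml;

	for (ml = (struct boot_memlist *)(uintptr_t)xbp->bi_phys_install;
	    ml != NULL;
	    ml = (struct boot_memlist *)(uintptr_t)ml->next)
		dboot_printf("addr=%" PRIx64 " size=%" PRIx64 "\n",
		    ml->addr, ml->size);
}
#endif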

#if defined(__xpv)

/*
 * halt on the hypervisor after a delay to drain console output
 */
void
dboot_halt(void)
{
	uint_t i = 10000;

	while (--i)
		(void) HYPERVISOR_yield();
	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical addresses are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif	/* __xpv */

x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}

paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}
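
/*
 * Illustrative sketch, not part of the original source: the index passed
 * to map_pte() for a given virtual address and table level is derived
 * from the shift_amt[] table roughly as below. The helper name is
 * hypothetical and the block is compiled out.
 */
#if 0
static uint_t
example_va_to_index(uint64_t va, uint_t level)
{
	return ((uint_t)(va >> shift_amt[level]) & (ptes_per_table - 1));
}
#endif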

/*
 * dump out the contents of page tables...
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
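
/*
 * Illustrative example, not part of the original source: when invoked,
 * dump_tables() prints one line per live entry, roughly of the form
 * (all values hypothetical)
 *
 *	0x11000[0x1f8] = 3fe00087, va=3f000000 physaddr=3fe00000
 *
 * and elides long runs of consecutive mappings as "...".
 */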

/*
 * Add a mapping for the machine page at the given virtual address.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_ma_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;	/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;

static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}
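
/*
 * Illustrative worked example, not part of the original source: starting
 * from the initial entry [pci_lo_limit, pci_hi_limit), a call such as
 * exclude_from_pci(0x20000000, 0x40000000) matches the "split a range?"
 * case and leaves two entries, [0x00100000, 0x20000000) and
 * [0x40000000, 0xfff00000).
 */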

/*
 * During memory allocation, find the highest address not used yet.
 */
static void
check_higher(paddr_t a)
{
	if (a < next_avail_addr)
		return;
	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
	DBG(next_avail_addr);
}

static int
dboot_loader_mmap_entries(void)
{
#if !defined(__xpv)
	if (num_entries_set == B_TRUE)
		return (num_entries);

	switch (multiboot_version) {
	case 1:
		DBG(mb_info->flags);
		if (mb_info->flags & 0x40) {	/* MB_INFO_MEM_MAP */
			mb_memory_map_t *mmap;

			DBG(mb_info->mmap_addr);
			DBG(mb_info->mmap_length);
			check_higher(mb_info->mmap_addr + mb_info->mmap_length);

			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
			    (uint32_t)mmap < mb_info->mmap_addr +
			    mb_info->mmap_length;
			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
			    mmap->size + sizeof (mmap->size)))
				++num_entries;

			num_entries_set = B_TRUE;
		}
		break;
	case 2:
		num_entries_set = B_TRUE;
		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
		    mb2_mmap_tagp);
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (num_entries);
#else
	return (MAXMAPS);
#endif
}

static uint32_t
dboot_loader_mmap_get_type(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_type(): index "
			    "out of bounds: %d\n", index);
		}
		return (mp->type);

	case 2:
		return (dboot_multiboot2_mmap_get_type(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (map_buffer[index].type);
#endif
}
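
/*
 * Illustrative sketch, not part of the original source: multiboot 1 mmap
 * entries are variable sized, and the size field counts only the bytes
 * that follow it, which is why the walks above and below advance by
 * size + sizeof (size). The helper name is hypothetical and the block is
 * compiled out.
 */
#if 0
static mb_memory_map_t *
example_next_mmap_entry(mb_memory_map_t *mp)
{
	return ((mb_memory_map_t *)((uint32_t)mp + mp->size +
	    sizeof (mp->size)));
}
#endif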

static uint64_t
dboot_loader_mmap_get_base(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_base(): index "
			    "out of bounds: %d\n", index);
		}
		return (((uint64_t)mp->base_addr_high << 32) +
		    (uint64_t)mp->base_addr_low);

	case 2:
		return (dboot_multiboot2_mmap_get_base(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
	    (uint64_t)map_buffer[index].base_addr_low);
#endif
}

static uint64_t
dboot_loader_mmap_get_length(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_length(): index "
			    "out of bounds: %d\n", index);
		}
		return (((uint64_t)mp->length_high << 32) +
		    (uint64_t)mp->length_low);

	case 2:
		return (dboot_multiboot2_mmap_get_length(mb2_info,
		    mb2_mmap_tagp, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (((uint64_t)map_buffer[index].length_high << 32) +
	    (uint64_t)map_buffer[index].length_low);
#endif
}

static void
build_pcimemlists(void)
{
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i, num;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	num = dboot_loader_mmap_entries();
	/*
	 * Fill in PCI memlists.
	 */
	for (i = 0; i < num; ++i) {
		start = dboot_loader_mmap_get_base(i);
		end = start + dboot_loader_mmap_get_length(i);

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
			    start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}
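
/*
 * Illustrative worked example, not part of the original source: the page
 * alignment above rounds start up and end down so that only whole pages
 * remain. With 4K pages, a range 0x1234..0x5fff becomes 0x2000..0x5000,
 * and a range smaller than one aligned page is dropped entirely.
 */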

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2MB alignment. That should be enough
	 * for the page tables we'll need to build. The nucleus memory is
	 * allocated last and will be outside the addressable range. We'll
	 * switch to new page tables before we unpack the kernel.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr =
		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists();
	}
}

#else	/* !__xpv */

static void
dboot_multiboot1_xboot_consinfo(void)
{
	bi->bi_framebuffer = NULL;
}

static void
dboot_multiboot2_xboot_consinfo(void)
{
	multiboot_tag_framebuffer_t *fb;
	fb = dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
}

static int
dboot_multiboot_modcount(void)
{
	switch (multiboot_version) {
	case 1:
		return (mb_info->mods_count);

	case 2:
		return (dboot_multiboot2_modcount(mb2_info));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static uint32_t
dboot_multiboot_modstart(int index)
{
	switch (multiboot_version) {
	case 1:
		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);

	case 2:
		return (dboot_multiboot2_modstart(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static uint32_t
dboot_multiboot_modend(int index)
{
	switch (multiboot_version) {
	case 1:
		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);

	case 2:
		return (dboot_multiboot2_modend(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

static char *
dboot_multiboot_modcmdline(int index)
{
	switch (multiboot_version) {
	case 1:
		return ((char *)((mb_module_t *)
		    mb_info->mods_addr)[index].mod_name);

	case 2:
		return (dboot_multiboot2_modcmdline(mb2_info, index));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
}

/*
 * Find the environment module for console setup.
 * Since we need the console to print early boot messages, the console is set
 * up before anything else, and therefore we need to pick up the environment
 * module early too.
 *
 * Note that we only search for the environment module here; if it is found,
 * it is passed to console setup. The proper module list processing will
 * happen later.
 */
static void
dboot_find_env(void)
{
	int i, modcount;
	uint32_t mod_start, mod_end;
	char *cmdline;

	modcount = dboot_multiboot_modcount();

	for (i = 0; i < modcount; ++i) {
		cmdline = dboot_multiboot_modcmdline(i);
		if (cmdline == NULL)
			continue;

		if (strstr(cmdline, "type=environment") == NULL)
			continue;

		mod_start = dboot_multiboot_modstart(i);
		mod_end = dboot_multiboot_modend(i);
		modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
		modules[0].bm_size = mod_end - mod_start;
		modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
		modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
		modules[0].bm_type = BMT_ENV;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
		bi->bi_module_cnt = 1;
		return;
	}
}

static boolean_t
dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
{
	boolean_t rv = B_FALSE;

	switch (multiboot_version) {
	case 1:
		if (mb_info->flags & 0x01) {	/* MB_INFO_MEMORY */
			*lower = mb_info->mem_lower;
			*upper = mb_info->mem_upper;
			rv = B_TRUE;
		}
		break;

	case 2:
		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (rv);
}

static uint8_t
dboot_a2h(char v)
{
	if (v >= 'a')
		return (v - 'a' + 0xa);
	else if (v >= 'A')
		return (v - 'A' + 0xa);
	else if (v >= '0')
		return (v - '0');
	else
		dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}

static void
digest_a2h(const char *ascii, uint8_t *digest)
{
	unsigned int i;

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
	}
}
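
/*
 * Illustrative example, not part of the original source: digest_a2h()
 * turns 40 ASCII hex characters into 20 raw bytes, so for the SHA-1 of an
 * empty input, "da39a3ee5e6b4b0d3255bfef95601890afd80709", digest[0]
 * becomes 0xda and digest[19] becomes 0x09.
 */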

/*
 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
 * match, return 0, otherwise -1. This works only for images smaller than
 * 4 GB, which should not be a problem.
 */
static int
check_image_hash(uint_t midx)
{
	const char *ascii;
	const void *image;
	size_t len;
	SHA1_CTX ctx;
	uint8_t digest[SHA1_DIGEST_LENGTH];
	uint8_t baseline[SHA1_DIGEST_LENGTH];
	unsigned int i;

	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
	image = (const void *)(uintptr_t)modules[midx].bm_addr;
	len = (size_t)modules[midx].bm_size;

	digest_a2h(ascii, baseline);

	SHA1Init(&ctx);
	SHA1Update(&ctx, image, len);
	SHA1Final(digest, &ctx);

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		if (digest[i] != baseline[i])
			return (-1);
	}

	return (0);
}

static const char *
type_to_str(boot_module_type_t type)
{
	switch (type) {
	case BMT_ROOTFS:
		return ("rootfs");
	case BMT_FILE:
		return ("file");
	case BMT_HASH:
		return ("hash");
	case BMT_ENV:
		return ("environment");
	default:
		return ("unknown");
	}
}

static void
check_images(void)
{
	uint_t i;
	char displayhash[SHA1_ASCII_LENGTH + 1];

	for (i = 0; i < modules_used; i++) {
		if (prom_debug) {
			dboot_printf("module #%d: name %s type %s "
			    "addr %lx size %lx\n",
			    i, (char *)(uintptr_t)modules[i].bm_name,
			    type_to_str(modules[i].bm_type),
			    (ulong_t)modules[i].bm_addr,
			    (ulong_t)modules[i].bm_size);
		}

		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
			DBG_MSG("module has no hash; skipping check\n");
			continue;
		}
		(void) memcpy(displayhash,
		    (void *)(uintptr_t)modules[i].bm_hash,
		    SHA1_ASCII_LENGTH);
		displayhash[SHA1_ASCII_LENGTH] = '\0';
		if (prom_debug) {
			dboot_printf("checking expected hash [%s]: ",
			    displayhash);
		}

		if (check_image_hash(i) != 0)
			dboot_panic("hash mismatch!\n");
		else
			DBG_MSG("OK\n");
	}
}

/*
 * Determine the module's starting address, size, name, and type, and fill the
 * boot_modules structure. This structure is used by the bop code, except for
 * hashes which are checked prior to transferring control to the kernel.
 */
static void
process_module(int midx)
{
	uint32_t mod_start = dboot_multiboot_modstart(midx);
	uint32_t mod_end = dboot_multiboot_modend(midx);
	char *cmdline = dboot_multiboot_modcmdline(midx);
	char *p, *q;

	check_higher(mod_end);
	if (prom_debug) {
		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
	}

	if (mod_start > mod_end) {
		dboot_panic("module #%d: module start address 0x%lx greater "
		    "than end address 0x%lx", midx,
		    (ulong_t)mod_start, (ulong_t)mod_end);
	}

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing. However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither. While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug. That means we won't actually hash the
	 * byte at mod_end, and we will expect that mod_end for the hash file
	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
	 * hash plus a newline for each module). We set bm_size to the true
	 * correct number of bytes in each module, achieving exactly this.
	 */

	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
	modules[midx].bm_size = mod_end - mod_start;
	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
	modules[midx].bm_type = BMT_FILE;

	if (cmdline == NULL) {
		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
		return;
	}

	p = cmdline;
	modules[midx].bm_name =
	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");

	while (p != NULL) {
		q = strsep(&p, " \t\f\n\r");
		if (strncmp(q, "name=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_name =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		if (strncmp(q, "type=", 5) == 0) {
			if (q[5] == '\0' || isspace(q[5]))
				continue;
			q += 5;
			if (strcmp(q, "rootfs") == 0) {
				modules[midx].bm_type = BMT_ROOTFS;
			} else if (strcmp(q, "hash") == 0) {
				modules[midx].bm_type = BMT_HASH;
			} else if (strcmp(q, "environment") == 0) {
				modules[midx].bm_type = BMT_ENV;
			} else if (strcmp(q, "file") != 0) {
				dboot_printf("\tmodule #%d: unknown module "
				    "type '%s'; defaulting to 'file'",
				    midx, q);
			}
			continue;
		}

		if (strncmp(q, "hash=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_hash =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		dboot_printf("ignoring unknown option '%s'\n", q);
	}
}
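
/*
 * Illustrative example, not part of the original source (the path shown
 * is hypothetical): a module line such as
 *
 *	/platform/i86pc/amd64/boot_archive type=rootfs name=ramdisk
 *
 * leaves bm_name pointing at "ramdisk" and sets bm_type to BMT_ROOTFS,
 * while unrecognized options are reported and ignored.
 */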

/*
 * Backward compatibility: if there are exactly one or two modules, both
 * of type 'file' and neither with an embedded hash value, we have been
 * given the legacy style modules. In this case we need to treat the first
 * module as a rootfs and the second as a hash referencing that module.
 * Otherwise, even if the configuration is invalid, we assume that the
 * operator knows what he's doing or at least isn't being bitten by this
 * interface change.
 */
static void
fixup_modules(void)
{
	if (modules_used == 0 || modules_used > 2)
		return;

	if (modules[0].bm_type != BMT_FILE ||
	    (modules_used > 1 && modules[1].bm_type != BMT_FILE)) {
		return;
	}

	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
	    (modules_used > 1 &&
	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL)) {
		return;
	}

	modules[0].bm_type = BMT_ROOTFS;
	if (modules_used > 1) {
		modules[1].bm_type = BMT_HASH;
		modules[1].bm_name = modules[0].bm_name;
	}
}

/*
 * For modules that do not have assigned hashes but have a separate hash
 * module, find that hash module and set the primary module's bm_hash to
 * point to its hash data. We will then ignore modules of type BMT_HASH
 * from this point forward.
 */
static void
assign_module_hashes(void)
{
	uint_t i, j;

	for (i = 0; i < modules_used; i++) {
		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
			continue;
		}

		for (j = 0; j < modules_used; j++) {
			if (modules[j].bm_type != BMT_HASH ||
			    strcmp((char *)(uintptr_t)modules[j].bm_name,
			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
				continue;
			}

			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
				dboot_printf("Short hash module of length "
				    "0x%lx bytes; ignoring\n",
				    (ulong_t)modules[j].bm_size);
			} else {
				modules[i].bm_hash = modules[j].bm_addr;
			}
			break;
		}
	}
}

/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 */
static void
dboot_process_modules(void)
{
	int i, modcount;
	extern char _end[];

	DBG_MSG("\nFinding Modules\n");
	modcount = dboot_multiboot_modcount();
	if (modcount > MAX_BOOT_MODULES) {
		dboot_panic("Too many modules (%d) -- the maximum is %d.",
		    modcount, MAX_BOOT_MODULES);
	}
	/*
	 * Search the modules to find the last used address; we'll build
	 * the module list while we're walking through here.
	 */
	check_higher((paddr_t)(uintptr_t)&_end);
	for (i = 0; i < modcount; ++i) {
		process_module(i);
		modules_used++;
	}
	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = modcount;
	DBG(bi->bi_module_cnt);

	fixup_modules();
	assign_module_hashes();
	check_images();
}

/*
 * We then build the phys_install memlist from the multiboot information.
 */
static void
dboot_process_mmap(void)
{
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint32_t lower, upper;
	int i, mmap_entries;

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	num_entries = 0;
	num_entries_set = B_FALSE;
	max_mem = 0;
	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
		for (i = 0; i < mmap_entries; i++) {
			uint32_t type = dboot_loader_mmap_get_type(i);
			start = dboot_loader_mmap_get_base(i);
			end = start + dboot_loader_mmap_get_length(i);

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM
			 */
			switch (type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				memlists[memlists_used].addr = start;
				memlists[memlists_used].size = end - start;
				++memlists_used;
				if (memlists_used > MAX_MEMLIST)
					dboot_panic("too many memlists");
				break;
			case 2:
				rsvdmemlists[rsvdmemlists_used].addr = start;
				rsvdmemlists[rsvdmemlists_used].size =
				    end - start;
				++rsvdmemlists_used;
				if (rsvdmemlists_used > MAX_MEMLIST)
					dboot_panic("too many rsvdmemlists");
				break;
			default:
				continue;
			}
		}
		build_pcimemlists();
	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
		DBG(lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = lower * 1024;
		++memlists_used;
		DBG(upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}
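
/*
 * Illustrative worked example, not part of the original source: on the
 * basic meminfo fallback path above, lower and upper are in KB, so
 * lower == 639 and upper == 523264 yield memlists covering 0..639KB and
 * 1MB..1MB+511MB, with the PCI range assumed to start right after.
 */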

/*
 * The highest address is used as the starting point for dboot's simple
 * memory allocator.
 *
 * Finding the highest address in the case of the Multiboot 1 protocol is
 * quite painful in the sense that some information provided by
 * the multiboot info structure points to BIOS data, and some to RAM.
 *
 * The module list was processed and checked already by
 * dboot_process_modules(), so we will check the command line string and
 * the memory map.
 *
 * This list of items to be checked is based on our current knowledge of
 * allocations made by grub1 and will need to be reviewed if there
 * are updates about the information provided by Multiboot 1.
 *
 * In the case of Multiboot 2, our life is much simpler, as the MB2
 * information tag list is one contiguous chunk of memory.
 */
static paddr_t
dboot_multiboot1_highest_addr(void)
{
	paddr_t addr = (paddr_t)(uintptr_t)NULL;
	char *cmdl = (char *)mb_info->cmdline;

	if (mb_info->flags & MB_INFO_CMDLINE)
		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));

	if (mb_info->flags & MB_INFO_MEM_MAP)
		addr = MAX(addr,
		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
	return (addr);
}

static void
dboot_multiboot_highest_addr(void)
{
	paddr_t addr;

	switch (multiboot_version) {
	case 1:
		addr = dboot_multiboot1_highest_addr();
		if (addr != (paddr_t)(uintptr_t)NULL)
			check_higher(addr);
		break;
	case 2:
		addr = dboot_multiboot2_highest_addr(mb2_info);
		if (addr != (paddr_t)(uintptr_t)NULL)
			check_higher(addr);
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
}

/*
 * Walk the boot loader provided information and find the highest free address.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}

static int
dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
{
	int i;

	if (g1->time_low != g2->time_low)
		return (0);
	if (g1->time_mid != g2->time_mid)
		return (0);
	if (g1->time_hi_and_version != g2->time_hi_and_version)
		return (0);
	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
		return (0);
	if (g1->clock_seq_low != g2->clock_seq_low)
		return (0);

	for (i = 0; i < 6; i++) {
		if (g1->node_addr[i] != g2->node_addr[i])
			return (0);
	}
	return (1);
}

static void
process_efi32(EFI_SYSTEM_TABLE32 *efi)
{
	uint32_t entries;
	EFI_CONFIGURATION_TABLE32 *config;
	int i;

	entries = efi->NumberOfTableEntries;
	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
	    efi->ConfigurationTable;

	for (i = 0; i < entries; i++) {
		/* Prefer the SMBIOS3 table over the legacy SMBIOS table. */
		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_smbios == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		/* Prefer acpi v2+ over v1. */
		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_acpi_rsdp == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
	}
}

static void
process_efi64(EFI_SYSTEM_TABLE64 *efi)
{
	uint64_t entries;
	EFI_CONFIGURATION_TABLE64 *config;
	int i;

	entries = efi->NumberOfTableEntries;
	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
	    efi->ConfigurationTable;

	for (i = 0; i < entries; i++) {
		/* Prefer the SMBIOS3 table over the legacy SMBIOS table. */
		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_smbios == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
			bi->bi_smbios = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		/* Prefer acpi v2+ over v1. */
		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
		if (bi->bi_acpi_rsdp == NULL &&
		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
			    config[i].VendorTable;
		}
	}
}

static void
dboot_multiboot_get_fwtables(void)
{
	multiboot_tag_new_acpi_t *nacpitagp;
	multiboot_tag_old_acpi_t *oacpitagp;
	multiboot_tag_efi64_t *efi64tagp = NULL;
	multiboot_tag_efi32_t *efi32tagp = NULL;

	/* no fw tables from multiboot 1 */
	if (multiboot_version != 2)
		return;

	efi64tagp = (multiboot_tag_efi64_t *)
	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
	if (efi64tagp != NULL) {
		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
		    efi64tagp->mb_pointer;
		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
		    efi64tagp->mb_pointer);
	} else {
		efi32tagp = (multiboot_tag_efi32_t *)
		    dboot_multiboot2_find_tag(mb2_info,
		    MULTIBOOT_TAG_TYPE_EFI32);
		if (efi32tagp != NULL) {
			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
			    efi32tagp->mb_pointer;
			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    efi32tagp->mb_pointer);
		}
	}

	/*
	 * The ACPI RSDP can be found by scanning the BIOS memory areas or
	 * from the EFI system table. The boot loader may pass in the address
	 * it found the ACPI tables at.
	 */
	nacpitagp = (multiboot_tag_new_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
	oacpitagp = (multiboot_tag_old_acpi_t *)
	    dboot_multiboot2_find_tag(mb2_info,
	    MULTIBOOT_TAG_TYPE_ACPI_OLD);

	if (nacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &nacpitagp->mb_rsdp[0];
	} else if (oacpitagp != NULL) {
		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
		    &oacpitagp->mb_rsdp[0];
	}
}

/* print out the EFI version string with a trailing newline */
static void
dboot_print_efi_version(uint32_t ver)
{
	int rev;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	rev = EFI_REV_MINOR(ver);
	if ((rev % 10) != 0) {
		dboot_printf("%d.%d\n", rev / 10, rev % 10);
	} else {
		dboot_printf("%d\n", rev / 10);
	}
}
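
/*
 * Illustrative worked example, not part of the original source: with the
 * logic above, a revision whose EFI_REV_MAJOR() is 2 and whose
 * EFI_REV_MINOR() is 31 prints as "2.3.1", while a minor of 40 prints
 * as "2.4".
 */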

static void
print_efi32(EFI_SYSTEM_TABLE32 *efi)
{
	uint16_t *data;
	EFI_CONFIGURATION_TABLE32 *conf;
	int i;

	dboot_printf("EFI32 signature: %llx\n",
	    (unsigned long long)efi->Hdr.Signature);
	dboot_printf("EFI system version: ");
	dboot_print_efi_version(efi->Hdr.Revision);
	dboot_printf("EFI system vendor: ");
	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
	for (i = 0; data[i] != 0; i++)
		dboot_printf("%c", (char)data[i]);
	dboot_printf("\nEFI firmware revision: ");
	dboot_print_efi_version(efi->FirmwareRevision);
	dboot_printf("EFI system table number of entries: %d\n",
	    efi->NumberOfTableEntries);
	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
	    efi->ConfigurationTable;
	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
		    conf[i].VendorGuid.time_low,
		    conf[i].VendorGuid.time_mid,
		    conf[i].VendorGuid.time_hi_and_version,
		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
		    conf[i].VendorGuid.clock_seq_low);
		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
		    conf[i].VendorGuid.node_addr[0],
		    conf[i].VendorGuid.node_addr[1],
		    conf[i].VendorGuid.node_addr[2],
		    conf[i].VendorGuid.node_addr[3],
		    conf[i].VendorGuid.node_addr[4],
		    conf[i].VendorGuid.node_addr[5]);
	}
}

static void
print_efi64(EFI_SYSTEM_TABLE64 *efi)
{
	uint16_t *data;
	EFI_CONFIGURATION_TABLE64 *conf;
	int i;

	dboot_printf("EFI64 signature: %llx\n",
	    (unsigned long long)efi->Hdr.Signature);
	dboot_printf("EFI system version: ");
	dboot_print_efi_version(efi->Hdr.Revision);
	dboot_printf("EFI system vendor: ");
	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
	for (i = 0; data[i] != 0; i++)
		dboot_printf("%c", (char)data[i]);
	dboot_printf("\nEFI firmware revision: ");
	dboot_print_efi_version(efi->FirmwareRevision);
	dboot_printf("EFI system table number of entries: %lld\n",
	    efi->NumberOfTableEntries);
	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
	    efi->ConfigurationTable;
	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
		    conf[i].VendorGuid.time_low,
		    conf[i].VendorGuid.time_mid,
		    conf[i].VendorGuid.time_hi_and_version,
		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
		    conf[i].VendorGuid.clock_seq_low);
		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
		    conf[i].VendorGuid.node_addr[0],
		    conf[i].VendorGuid.node_addr[1],
		    conf[i].VendorGuid.node_addr[2],
		    conf[i].VendorGuid.node_addr[3],
		    conf[i].VendorGuid.node_addr[4],
		    conf[i].VendorGuid.node_addr[5]);
	}
}
#endif /* !__xpv */

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}
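
/*
 * Illustrative sketch, not part of the original source: callers get back
 * zeroed, page-aligned memory and never free it. The helper name is
 * hypothetical and the block is compiled out.
 */
#if 0
static void *
example_alloc_scratch_page(void)
{
	/* one zeroed page, aligned to MMU_PAGESIZE */
	return (mem_alloc(MMU_PAGESIZE));
}
#endif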

/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory. (Specifically the video frame buffer and keyboard
	 * status ports.) If we're booting on raw hardware then GRUB
	 * created these mappings for us. If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)

	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		if (start >= next_avail_addr)
			break;
	}

	/*
	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
	 * device and therefore must not be cached.
	 */
	if (bi->bi_framebuffer != NULL) {
		multiboot_tag_framebuffer_t *fb;
		fb = (multiboot_tag_framebuffer_t *)(uintptr_t)
		    bi->bi_framebuffer;

		start = fb->framebuffer_common.framebuffer_addr;
		end = start + fb->framebuffer_common.framebuffer_height *
		    fb->framebuffer_common.framebuffer_pitch;

		pte_bits |= PT_NOCACHE;
		while (start < end) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		pte_bits &= ~PT_NOCACHE;
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}

#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"

static void
dboot_init_xboot_consinfo(void)
{
	uintptr_t addr;

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;

#if !defined(__xpv)
	switch (multiboot_version) {
	case 1:
		dboot_multiboot1_xboot_consinfo();
		break;
	case 2:
		dboot_multiboot2_xboot_consinfo();
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	/*
	 * Lookup the environment module for the console. The complete
	 * module list will be built after console setup.
	 */
	dboot_find_env();
#endif
}
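
/*
 * Illustrative worked example, not part of the original source: the
 * (addr + 0xf) & ~0xf computation above rounds up to the next 16-byte
 * boundary, e.g. 0x1008 becomes 0x1010 while 0x1010 stays 0x1010; the
 * extra boot_info slot guarantees room for the rounded-up pointer.
 */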

#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"

static void
dboot_init_xboot_consinfo(void)
{
	uintptr_t addr;

	/*
	 * boot info must be 16-byte aligned for the 64-bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;

#if !defined(__xpv)
	switch (multiboot_version) {
	case 1:
		dboot_multiboot1_xboot_consinfo();
		break;
	case 2:
		dboot_multiboot2_xboot_consinfo();
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	/*
	 * Look up the environment module for the console.  The complete
	 * module list will be built after console setup.
	 */
	dboot_find_env();
#endif
}

/*
 * Set up basic data from the boot loader.
 * load_addr is part of the AOUT kludge set up in dboot_grub.s; it lets the
 * 32-bit dboot code set up and start a 64-bit kernel.  The AOUT kludge is
 * what allows a 32-bit boot loader, such as grub1, to load and start a
 * 64-bit illumos kernel.
 */
static void
dboot_loader_init(void)
{
#if !defined(__xpv)
	mb_info = NULL;
	mb2_info = NULL;

	switch (mb_magic) {
	case MB_BOOTLOADER_MAGIC:
		multiboot_version = 1;
		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
#if defined(_BOOT_TARGET_amd64)
		load_addr = mb_header.load_addr;
#endif
		break;

	case MULTIBOOT2_BOOTLOADER_MAGIC:
		multiboot_version = 2;
		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
#if defined(_BOOT_TARGET_amd64)
		load_addr = mb2_load_addr;
#endif
		break;

	default:
		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
		break;
	}
#endif /* !defined(__xpv) */
}
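
/*
 * For reference, the magic values dispatched on above come from the
 * Multiboot specifications: a Multiboot 1 loader enters dboot with
 * %eax = MB_BOOTLOADER_MAGIC (0x2BADB002), a Multiboot 2 loader with
 * %eax = MULTIBOOT2_BOOTLOADER_MAGIC (0x36d76289), and %ebx holding
 * the physical address of the info structure; dboot's assembler
 * preserves those registers in mb_magic and mb_addr before calling
 * into C.
 */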

/* Extract the kernel command line from [multi]boot information. */
static char *
dboot_loader_cmdline(void)
{
	char *line = NULL;

#if defined(__xpv)
	line = (char *)xen_info->cmd_line;
#else /* __xpv */

	switch (multiboot_version) {
	case 1:
		if (mb_info->flags & MB_INFO_CMDLINE)
			line = (char *)mb_info->cmdline;
		break;

	case 2:
		line = dboot_multiboot2_cmdline(mb2_info);
		break;

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}

#endif /* __xpv */

	/*
	 * Make sure we have a valid pointer so the string operations
	 * will not crash us.
	 */
	if (line == NULL)
		line = "";

	return (line);
}

static char *
dboot_loader_name(void)
{
#if defined(__xpv)
	return (NULL);
#else /* __xpv */
	multiboot_tag_string_t *tag;

	switch (multiboot_version) {
	case 1:
		return ((char *)mb_info->boot_loader_name);

	case 2:
		tag = dboot_multiboot2_find_tag(mb2_info,
		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
		if (tag == NULL)	/* the tag is optional */
			return (NULL);
		return (tag->mb_string);

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}

	return (NULL);
#endif /* __xpv */
}
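
/*
 * Sketch of the tag walk a helper like dboot_multiboot2_find_tag() must
 * perform (assumed from the Multiboot 2 spec; the real helper lives
 * elsewhere in dboot): tags follow the fixed info header, each padded
 * out to 8-byte alignment and terminated by an end tag, roughly:
 *
 *	for (tag = first tag past the header;
 *	    tag->mb_type != MULTIBOOT_TAG_TYPE_END;
 *	    tag = (multiboot_tag_t *)((uintptr_t)tag +
 *	    P2ROUNDUP(tag->mb_size, 8)))
 *		if (tag->mb_type == type)
 *			return (tag);
 */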

/*
 * startup_kernel has a pretty simple job.  It builds page tables which
 * reflect 1:1 mappings for all memory in use.  It then also adds mappings
 * for the kernel nucleus at the virtual address of target_kernel_text,
 * using large page mappings when available.  The page table pages are
 * also accessible at 1:1 mapped virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	char *bootloader;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	dboot_loader_init();
	/*
	 * At this point we are executing in 32-bit protected mode.
	 */

	bootloader = dboot_loader_name();
	cmdline = dboot_loader_cmdline();

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable I/O operations, so set the I/O privilege level
	 * to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	dboot_init_xboot_consinfo();
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
	bcons_init(bi);

	prom_debug = (find_boot_prop("prom_debug") != NULL);
	map_debug = (find_boot_prop("map_debug") != NULL);

#if !defined(__xpv)
	dboot_multiboot_get_fwtables();
#endif
	DBG_MSG("\n\nillumos prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (bootloader != NULL && prom_debug) {
		dboot_printf("Kernel loaded by: %s\n", bootloader);
#if !defined(__xpv)
		dboot_printf("Using multiboot %d boot protocol.\n",
		    multiboot_version);
#endif
	}

	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	DBG((uintptr_t)bi);
#if !defined(__xpv)
	DBG((uintptr_t)mb_info);
	DBG((uintptr_t)mb2_info);
	if (mb2_info != NULL)
		DBG(mb2_info->mbi_total_size);
	DBG(bi->bi_acpi_rsdp);
	DBG(bi->bi_smbios);
	DBG(bi->bi_uefi_arch);
	DBG(bi->bi_uefi_systab);

	if (bi->bi_uefi_systab && prom_debug) {
		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
			    bi->bi_uefi_systab);
		} else {
			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    bi->bi_uefi_systab);
		}
	}
#endif

	/*
	 * We need the correct target_kernel_text value.
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#elif defined(__xpv)
	target_kernel_text = KERNEL_TEXT_i386_xpv;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor.
	 */
	amd64_support = 1;
	pae_support = 1;

#else	/* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE hypervisor.
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif	/* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads stuff starting at 1Gig.
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * Enable writable page table mode for the hypervisor.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * Check for NX support.
	 */
	if (pae_support) {
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests.  This means that when a guest attempts to install a flat
	 * 4GB code or data descriptor, the 32-bit hypervisor protects itself
	 * by silently shrinking the segment, so that any access to where the
	 * hypervisor lives generates a #gp fault.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data.  TLS support in the
	 * linux brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest,
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif	/* !_BOOT_TARGET_amd64 */

#else	/* __xpv */

	/*
	 * Use cpuid to detect and enable MMU features.
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */
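
/*
 * For reference, the feature bits tested above are the architectural
 * CPUID bits: leaf 1 %edx bit 3 = PSE, bit 6 = PAE, bit 13 = PGE;
 * extended leaf 0x80000001 %edx bit 20 = NX, bit 29 = LM (long mode).
 * The probe of leaf 0x80000000 first establishes that the extended
 * leaf exists, which is why %eax is checked against 0x80000001.
 */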

#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to override the use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * Initialize the simple memory allocator.
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * Disable PAE on 32-bit hardware without NX and with less than
	 * 4Gig of memory.
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * Configure MMU information.
	 */
	if (pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}
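
	/*
	 * Consistency check on the values just chosen: in both modes one
	 * page table fills exactly one page -- PAE: 512 entries * 8 bytes
	 * = 4096; non-PAE: 1024 entries * 4 bytes = 4096.  top_level is
	 * the zero-based index of the root level: 3 selects the 4-level
	 * amd64 tables, 2 the 3-level PAE tables, and 1 the classic
	 * 2-level tables.
	 */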

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to the final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);

	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	DBG(load_addr);
	if (dboot_elfload64(load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * Return to assembly code to switch to running the kernel.
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * Unmap unused pages in the start area to make them available
	 * for DMA.
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_version = multiboot_version;

	switch (multiboot_version) {
	case 1:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
		break;
	case 2:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}