/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2013 Joyent, Inc. All rights reserved.
 */


#include <sys/types.h>
#include <sys/machparam.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/mach_mmu.h>
#include <sys/multiboot.h>
#include <sys/sha1.h>
#include <util/string.h>
#include <util/strtolctype.h>

#if defined(__xpv)

#include <sys/hypervisor.h>
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;

#else /* !__xpv */

extern multiboot_header_t mb_header;
extern int have_cpuid(void);

#endif /* !__xpv */

#include <sys/inttypes.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
#include <sys/boot_console.h>

#include "dboot_asm.h"
#include "dboot_printf.h"
#include "dboot_xboot.h"
#include "dboot_elfload.h"

#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)

/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or an XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * These are the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel().
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;		/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;

/*
 * Debugging macros
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

static char noname[2] = "-";

/*
 * Either hypervisor-specific or grub-specific code builds the initial
 * memlists. This code does the sort/merge/link for final use.
 */
static void
sort_physinstall(void)
{
	int i;
#if !defined(__xpv)
	int j;
	struct boot_memlist tmp;

	/*
	 * Now sort the memlists, in case they weren't in order.
	 * Yeah, this is a bubble sort; small, simple and easy to get right.
	 */
	DBG_MSG("Sorting phys-installed list\n");
	for (j = memlists_used - 1; j > 0; --j) {
		for (i = 0; i < j; ++i) {
			if (memlists[i].addr < memlists[i + 1].addr)
				continue;
			tmp = memlists[i];
			memlists[i] = memlists[i + 1];
			memlists[i + 1] = tmp;
		}
	}

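	/*
	 * Editor's note (illustrative, not from the original source): once
	 * sorted, a hypothetical adjacent pair of entries such as
	 *	{ addr=0x100000, size=0x7ff00000 } and
	 *	{ addr=0x80000000, size=0x40000000 }
	 * satisfies addr + size == next addr, so the merge loop below would
	 * collapse them into a single 0x100000..0xc0000000 entry.
	 */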
	/*
	 * Merge any memlists that don't have holes between them.
	 */
	for (i = 0; i <= memlists_used - 1; ++i) {
		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
			continue;

		if (prom_debug)
			dboot_printf(
			    "merging mem segs %" PRIx64 "...%" PRIx64
			    " w/ %" PRIx64 "...%" PRIx64 "\n",
			    memlists[i].addr,
			    memlists[i].addr + memlists[i].size,
			    memlists[i + 1].addr,
			    memlists[i + 1].addr + memlists[i + 1].size);

		memlists[i].size += memlists[i + 1].size;
		for (j = i + 1; j < memlists_used - 1; ++j)
			memlists[j] = memlists[j + 1];
		--memlists_used;
		DBG(memlists_used);
		--i;	/* after merging we need to reexamine, so do this */
	}
#endif	/* __xpv */

	if (prom_debug) {
		dboot_printf("\nFinal memlists:\n");
		for (i = 0; i < memlists_used; ++i) {
			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
		}
	}

	/*
	 * link together the memlists with native size pointers
	 */
	memlists[0].next = 0;
	memlists[0].prev = 0;
	for (i = 1; i < memlists_used; ++i) {
		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
		memlists[i].next = 0;
		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
	}
	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
	DBG(bi->bi_phys_install);
}

/*
 * build bios reserved memlists
 */
static void
build_rsvdmemlists(void)
{
	int i;

	rsvdmemlists[0].next = 0;
	rsvdmemlists[0].prev = 0;
	for (i = 1; i < rsvdmemlists_used; ++i) {
		rsvdmemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
		rsvdmemlists[i].next = 0;
		rsvdmemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
	}
	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
	DBG(bi->bi_rsvdmem);
}

#if defined(__xpv)

/*
 * halt on the hypervisor after a delay to drain console output
 */
void
dboot_halt(void)
{
	uint_t i = 10000;

	while (--i)
		(void) HYPERVISOR_yield();
	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical addresses are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif	/* __xpv */

x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}

paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

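/*
 * Editor's note (illustrative, not from the original source): casting the
 * table address straight to a pointer in map_pte() and get_pteval() relies
 * on the property described in the startup_kernel() comment below, namely
 * that page table pages remain accessible at 1:1 mapped virtual addresses
 * while dboot is running.
 */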
/*
 * dump out the contents of page tables...
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}

/*
 * Add a mapping for the machine page at the given virtual address.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;	/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}

/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif

static void
build_pcimemlists(mmap_t *mem, int num)
{
	mmap_t *mmap;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

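	/*
	 * Editor's note (illustrative, not from the original source): the
	 * loop below punches holes in the initial pci_lo_limit..pci_hi_limit
	 * window by calling exclude_from_pci() for every range reported in
	 * the BIOS memory map, leaving only address space that is free for
	 * PCI use.
	 */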
	/*
	 * Fill in PCI memlists.
	 */
	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
		start = ((uint64_t)mmap->base_addr_high << 32) +
		    mmap->base_addr_low;
		end = start + ((uint64_t)mmap->length_high << 32) +
		    mmap->length_low;

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", mmap->type, start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
static void
init_mem_alloc(void)
{
	int	local;		/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment. That should be enough
	 * for the page tables we'll need to build. The nucleus memory is
	 * allocated last and will be outside the addressable range. We'll
	 * switch to new page tables before we unpack the kernel.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}

#else /* !__xpv */

static uint8_t
dboot_a2h(char v)
{
	if (v >= 'a')
		return (v - 'a' + 0xa);
	else if (v >= 'A')
		return (v - 'A' + 0xa);
	else if (v >= '0')
		return (v - '0');
	else
		dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}

static void
digest_a2h(const char *ascii, uint8_t *digest)
{
	unsigned int i;

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
	}
}

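/*
 * Editor's note (illustrative, not from the original source): digest_a2h()
 * packs each pair of ASCII hex characters into one byte, so for example the
 * characters 'a' and '1' at the start of a hash string become the single
 * byte 0xa1 in digest[0].
 */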
/*
 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
 * match, return 0, otherwise -1. This works only for images smaller than
 * 4 GB, which should not be a problem.
 */
static int
check_image_hash(uint_t midx)
{
	const char *ascii;
	const void *image;
	size_t len;
	SHA1_CTX ctx;
	uint8_t digest[SHA1_DIGEST_LENGTH];
	uint8_t baseline[SHA1_DIGEST_LENGTH];
	unsigned int i;

	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
	image = (const void *)(uintptr_t)modules[midx].bm_addr;
	len = (size_t)modules[midx].bm_size;

	digest_a2h(ascii, baseline);

	SHA1Init(&ctx);
	SHA1Update(&ctx, image, len);
	SHA1Final(digest, &ctx);

	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
		if (digest[i] != baseline[i])
			return (-1);
	}

	return (0);
}

static const char *
type_to_str(boot_module_type_t type)
{
	switch (type) {
	case BMT_ROOTFS:
		return ("rootfs");
	case BMT_FILE:
		return ("file");
	case BMT_HASH:
		return ("hash");
	default:
		return ("unknown");
	}
}

static void
check_images(void)
{
	uint_t i;
	char displayhash[SHA1_ASCII_LENGTH + 1];

	for (i = 0; i < modules_used; i++) {
		if (prom_debug) {
			dboot_printf("module #%d: name %s type %s "
			    "addr %lx size %lx\n",
			    i, (char *)(uintptr_t)modules[i].bm_name,
			    type_to_str(modules[i].bm_type),
			    (ulong_t)modules[i].bm_addr,
			    (ulong_t)modules[i].bm_size);
		}

		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash == NULL) {
			DBG_MSG("module has no hash; skipping check\n");
			continue;
		}
		(void) memcpy(displayhash,
		    (void *)(uintptr_t)modules[i].bm_hash,
		    SHA1_ASCII_LENGTH);
		displayhash[SHA1_ASCII_LENGTH] = '\0';
		if (prom_debug) {
			dboot_printf("checking expected hash [%s]: ",
			    displayhash);
		}

		if (check_image_hash(i) != 0)
			dboot_panic("hash mismatch!\n");
		else
			DBG_MSG("OK\n");
	}
}

/*
 * Determine the module's starting address, size, name, and type, and fill the
 * boot_modules structure. This structure is used by the bop code, except for
 * hashes which are checked prior to transferring control to the kernel.
 */
static void
process_module(mb_module_t *mod)
{
	int midx = modules_used++;
	char *p, *q;

	if (prom_debug) {
		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
		    midx, (char *)(mod->mod_name),
		    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
	}

	if (mod->mod_start > mod->mod_end) {
		dboot_panic("module #%d: module start address 0x%lx greater "
		    "than end address 0x%lx", midx,
		    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
	}

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing. However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither. While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug.
	 * That means we won't actually hash the byte at mod_end, and we will
	 * expect that mod_end for the hash file itself is one greater than
	 * some multiple of 41 (40 bytes of ASCII hash plus a newline for each
	 * module). We set bm_size to the true correct number of bytes in each
	 * module, achieving exactly this.
	 */

	modules[midx].bm_addr = mod->mod_start;
	modules[midx].bm_size = mod->mod_end - mod->mod_start;
	modules[midx].bm_name = mod->mod_name;
	modules[midx].bm_hash = NULL;
	modules[midx].bm_type = BMT_FILE;

	if (mod->mod_name == NULL) {
		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
		return;
	}

	p = (char *)(uintptr_t)mod->mod_name;
	modules[midx].bm_name =
	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");

	while (p != NULL) {
		q = strsep(&p, " \t\f\n\r");
		if (strncmp(q, "name=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_name =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		if (strncmp(q, "type=", 5) == 0) {
			if (q[5] == '\0' || isspace(q[5]))
				continue;
			q += 5;
			if (strcmp(q, "rootfs") == 0) {
				modules[midx].bm_type = BMT_ROOTFS;
			} else if (strcmp(q, "hash") == 0) {
				modules[midx].bm_type = BMT_HASH;
			} else if (strcmp(q, "file") != 0) {
				dboot_printf("\tmodule #%d: unknown module "
				    "type '%s'; defaulting to 'file'",
				    midx, q);
			}
			continue;
		}

		if (strncmp(q, "hash=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_hash =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		dboot_printf("ignoring unknown option '%s'\n", q);
	}
}

/*
 * Backward compatibility: if there are exactly one or two modules, both
 * of type 'file' and neither with an embedded hash value, we have been
 * given the legacy style modules. In this case we need to treat the first
 * module as a rootfs and the second as a hash referencing that module.
 * Otherwise, even if the configuration is invalid, we assume that the
 * operator knows what he's doing or at least isn't being bitten by this
 * interface change.
 */
static void
fixup_modules(void)
{
	if (modules_used == 0 || modules_used > 2)
		return;

	if (modules[0].bm_type != BMT_FILE ||
	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
		return;
	}

	if (modules[0].bm_hash != NULL ||
	    modules_used > 1 && modules[1].bm_hash != NULL) {
		return;
	}

	modules[0].bm_type = BMT_ROOTFS;
	if (modules_used > 1) {
		modules[1].bm_type = BMT_HASH;
		modules[1].bm_name = modules[0].bm_name;
	}
}

/*
 * For modules that do not have assigned hashes but have a separate hash
 * module, find the assigned hash module and set the primary module's bm_hash
 * to point to the hash data from that module. We will then ignore modules of
 * type BMT_HASH from this point forward.
 */
static void
assign_module_hashes(void)
{
	uint_t i, j;

	for (i = 0; i < modules_used; i++) {
		if (modules[i].bm_type == BMT_HASH ||
		    modules[i].bm_hash != NULL) {
			continue;
		}

		for (j = 0; j < modules_used; j++) {
			if (modules[j].bm_type != BMT_HASH ||
			    strcmp((char *)(uintptr_t)modules[j].bm_name,
			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
				continue;
			}

			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
				dboot_printf("Short hash module of length "
				    "0x%lx bytes; ignoring\n",
				    (ulong_t)modules[j].bm_size);
			} else {
				modules[i].bm_hash = modules[j].bm_addr;
			}
			break;
		}
	}
}

/*
 * During memory allocation, find the highest address not used yet.
 */
static void
check_higher(paddr_t a)
{
	if (a < next_avail_addr)
		return;
	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
	DBG(next_avail_addr);
}

/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 *
 * We then build the phys_install memlist from the multiboot information.
 */
static void
init_mem_alloc(void)
{
	mb_memory_map_t *mmap;
	mb_module_t *mod;
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	extern char _end[];
	int i;

	DBG_MSG("Entered init_mem_alloc()\n");
	DBG((uintptr_t)mb_info);

	if (mb_info->mods_count > MAX_BOOT_MODULES) {
		dboot_panic("Too many modules (%d) -- the maximum is %d.",
		    mb_info->mods_count, MAX_BOOT_MODULES);
	}
	/*
	 * search the modules to find the last used address
	 * we'll build the module list while we're walking through here
	 */
	DBG_MSG("\nFinding Modules\n");
	check_higher((paddr_t)(uintptr_t)&_end);
	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
	    i < mb_info->mods_count;
	    ++mod, ++i) {
		process_module(mod);
		check_higher(mod->mod_end);
	}
	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = mb_info->mods_count;
	DBG(bi->bi_module_cnt);

	fixup_modules();
	assign_module_hashes();
	check_images();

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
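	/*
	 * Editor's note (added for clarity; see the multiboot 0.6.96
	 * specification): in mb_info->flags, bit 6 (0x40) means the
	 * mmap_addr/mmap_length BIOS memory map is valid, while bit 0 (0x01)
	 * means only the coarse mem_lower/mem_upper values are valid. The
	 * code below prefers the full map and falls back to the coarse
	 * values.
	 */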
	DBG_MSG("\nFinding Memory Map\n");
	DBG(mb_info->flags);
	max_mem = 0;
	if (mb_info->flags & 0x40) {
		int cnt = 0;

		DBG(mb_info->mmap_addr);
		DBG(mb_info->mmap_length);
		check_higher(mb_info->mmap_addr + mb_info->mmap_length);

		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
		    + sizeof (mmap->size))) {
			++cnt;
			start = ((uint64_t)mmap->base_addr_high << 32) +
			    mmap->base_addr_low;
			end = start + ((uint64_t)mmap->length_high << 32) +
			    mmap->length_low;

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", mmap->type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM
			 */
			switch (mmap->type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				memlists[memlists_used].addr = start;
				memlists[memlists_used].size = end - start;
				++memlists_used;
				if (memlists_used > MAX_MEMLIST)
					dboot_panic("too many memlists");
				break;
			case 2:
				rsvdmemlists[rsvdmemlists_used].addr = start;
				rsvdmemlists[rsvdmemlists_used].size =
				    end - start;
				++rsvdmemlists_used;
				if (rsvdmemlists_used > MAX_MEMLIST)
					dboot_panic("too many rsvdmemlists");
				break;
			default:
				continue;
			}
		}
		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
	} else if (mb_info->flags & 0x01) {
		DBG(mb_info->mem_lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = mb_info->mem_lower * 1024;
		++memlists_used;
		DBG(mb_info->mem_upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = mb_info->mem_upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr =
		    (mb_info->mem_upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	check_higher(bi->bi_cmdline);

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}
#endif /* !__xpv */

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 */
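/*
 * Editor's worked example (hypothetical numbers, not from the original
 * source): with next_avail_addr at 0x1234000, a call to
 * do_mem_alloc(0x1800, 0x1000) first rounds the size up to 0x2000; if that
 * range lies within a memlist entry, the allocation returns 0x1234000 and
 * next_avail_addr advances to 0x1236000.
 */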
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}


/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory. (Specifically the video frame buffer and keyboard
	 * status ports.) If we're booting on raw hardware then GRUB
	 * created these mappings for us. If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}

#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"

/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at the virtual address of target_kernel_text using
 * large page mappings. The page table pages are also accessible at 1:1
 * mapped virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	/*
	 * At this point we are executing in 32 bit real mode.
	 */
#if defined(__xpv)
	cmdline = (char *)xen_info->cmd_line;
#else /* __xpv */
	cmdline = (char *)mb_info->cmdline;
#endif /* __xpv */

	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable I/O operations, so set the I/O privilege level to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	bcons_init(cmdline);
	DBG_MSG("\n\nSolaris prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;
	DBG((uintptr_t)bi);
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;

	/*
	 * Need correct target_kernel_text value
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#elif defined(__xpv)
	target_kernel_text = KERNEL_TEXT_i386_xpv;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor.
	 */
	amd64_support = 1;
	pae_support = 1;

#else	/* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE Hypervisor
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif	/* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads stuff starting at 1Gig
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * enable writable page table mode for the hypervisor
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * check for NX support
	 */
	if (pae_support) {
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests. This means when a guest attempts to install a flat 4GB
	 * code or data descriptor the 32-bit hypervisor will protect itself
	 * by silently shrinking the segment such that if the guest attempts
	 * any access where the hypervisor lives a #gp fault is generated.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data. TLS support in linux
	 * brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif	/* !_BOOT_TARGET_amd64 */

#else	/* __xpv */

	/*
	 * use cpuid to enable MMU features
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */


#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to override use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * initialize the simple memory allocator
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * configure mmu information
	 */
	if (pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	if (dboot_elfload64(mb_header.load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * unmap unused pages in start area to make them available for DMA
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_info = (uintptr_t)mb_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}