1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/machparam.h> 31 #include <sys/x86_archext.h> 32 #include <sys/systm.h> 33 #include <sys/mach_mmu.h> 34 #include <sys/multiboot.h> 35 36 #if defined(__xpv) 37 38 #include <sys/hypervisor.h> 39 uintptr_t xen_virt_start; 40 pfn_t *mfn_to_pfn_mapping; 41 42 #else /* !__xpv */ 43 44 extern multiboot_header_t mb_header; 45 extern int have_cpuid(void); 46 47 #endif /* !__xpv */ 48 49 #include <sys/inttypes.h> 50 #include <sys/bootinfo.h> 51 #include <sys/mach_mmu.h> 52 #include <sys/boot_console.h> 53 54 #include "dboot_asm.h" 55 #include "dboot_printf.h" 56 #include "dboot_xboot.h" 57 #include "dboot_elfload.h" 58 59 /* 60 * This file contains code that runs to transition us from either a multiboot 61 * compliant loader (32 bit non-paging) or a XPV domain loader to 62 * regular kernel execution. Its task is to setup the kernel memory image 63 * and page tables. 
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1, and
 * since paging is not yet enabled, all such memory is directly accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation; grows monotonically upward
 * (see do_mem_alloc(), nothing is ever freed).
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
119 */ 120 multiboot_info_t *mb_info; 121 122 #endif /* __xpv */ 123 124 /* 125 * This contains information passed to the kernel 126 */ 127 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */ 128 struct xboot_info *bi; 129 130 /* 131 * Page table and memory stuff. 132 */ 133 static paddr_t max_mem; /* maximum memory address */ 134 135 /* 136 * Information about processor MMU 137 */ 138 int amd64_support = 0; 139 int largepage_support = 0; 140 int pae_support = 0; 141 int pge_support = 0; 142 int NX_support = 0; 143 144 /* 145 * Low 32 bits of kernel entry address passed back to assembler. 146 * When running a 64 bit kernel, the high 32 bits are 0xffffffff. 147 */ 148 uint32_t entry_addr_low; 149 150 /* 151 * Memlists for the kernel. We shouldn't need a lot of these. 152 */ 153 #define MAX_MEMLIST (50) 154 struct boot_memlist memlists[MAX_MEMLIST]; 155 uint_t memlists_used = 0; 156 struct boot_memlist pcimemlists[MAX_MEMLIST]; 157 uint_t pcimemlists_used = 0; 158 159 #define MAX_MODULES (10) 160 struct boot_modules modules[MAX_MODULES]; 161 uint_t modules_used = 0; 162 163 /* 164 * Debugging macros 165 */ 166 uint_t prom_debug = 0; 167 uint_t map_debug = 0; 168 169 /* 170 * Either hypervisor-specific or grub-specific code builds the initial 171 * memlists. This code does the sort/merge/link for final use. 172 */ 173 static void 174 sort_physinstall(void) 175 { 176 int i; 177 #if !defined(__xpv) 178 int j; 179 struct boot_memlist tmp; 180 181 /* 182 * Now sort the memlists, in case they weren't in order. 183 * Yeah, this is a bubble sort; small, simple and easy to get right. 184 */ 185 DBG_MSG("Sorting phys-installed list\n"); 186 for (j = memlists_used - 1; j > 0; --j) { 187 for (i = 0; i < j; ++i) { 188 if (memlists[i].addr < memlists[i + 1].addr) 189 continue; 190 tmp = memlists[i]; 191 memlists[i] = memlists[i + 1]; 192 memlists[i + 1] = tmp; 193 } 194 } 195 196 /* 197 * Merge any memlists that don't have holes between them. 
198 */ 199 for (i = 0; i <= memlists_used - 1; ++i) { 200 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr) 201 continue; 202 203 if (prom_debug) 204 dboot_printf( 205 "merging mem segs %" PRIx64 "...%" PRIx64 206 " w/ %" PRIx64 "...%" PRIx64 "\n", 207 memlists[i].addr, 208 memlists[i].addr + memlists[i].size, 209 memlists[i + 1].addr, 210 memlists[i + 1].addr + memlists[i + 1].size); 211 212 memlists[i].size += memlists[i + 1].size; 213 for (j = i + 1; j < memlists_used - 1; ++j) 214 memlists[j] = memlists[j + 1]; 215 --memlists_used; 216 DBG(memlists_used); 217 --i; /* after merging we need to reexamine, so do this */ 218 } 219 #endif /* __xpv */ 220 221 if (prom_debug) { 222 dboot_printf("\nFinal memlists:\n"); 223 for (i = 0; i < memlists_used; ++i) { 224 dboot_printf("\t%d: addr=%" PRIx64 " size=%" 225 PRIx64 "\n", i, memlists[i].addr, memlists[i].size); 226 } 227 } 228 229 /* 230 * link together the memlists with native size pointers 231 */ 232 memlists[0].next = 0; 233 memlists[0].prev = 0; 234 for (i = 1; i < memlists_used; ++i) { 235 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1); 236 memlists[i].next = 0; 237 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i); 238 } 239 bi->bi_phys_install = (native_ptr_t)memlists; 240 DBG(bi->bi_phys_install); 241 } 242 243 #if defined(__xpv) 244 245 /* 246 * halt on the hypervisor after a delay to drain console output 247 */ 248 void 249 dboot_halt(void) 250 { 251 uint_t i = 10000; 252 253 while (--i) 254 HYPERVISOR_yield(); 255 HYPERVISOR_shutdown(SHUTDOWN_poweroff); 256 } 257 258 /* 259 * From a machine address, find the corresponding pseudo-physical address. 260 * Pseudo-physical address are contiguous and run from mfn_base in each VM. 261 * Machine addresses are the real underlying hardware addresses. 262 * These are needed for page table entries. Note that this routine is 263 * poorly protected. A bad value of "ma" will cause a page fault. 
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	/* a pfn beyond our page count means "ma" isn't one of our pages */
	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	/* sanity check against the inverse translation */
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
 * Panics if "pa" is below mfn_base or beyond this domain's page count.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	/* sanity check against the inverse translation */
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif	/* __xpv */

/*
 * Read PTE number "index" from the page table at physical address "table".
 * Entries are 64 bits wide with PAE, 32 bits without.
 */
x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*
 * Write PTE number "index" in the page table at physical address "table".
 * Under the hypervisor this must go through an mmu_update hypercall, since
 * our page tables are write protected; on the metal we write directly.
 */
/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	/*
	 * NOTE(review): the cr3 reload only on a PAE-32 top level update
	 * (level == 2) appears intentional -- presumably because those
	 * top level entries are cached by the CPU -- confirm.
	 */
	if (level == top_level && level == 2)
		reload_cr3();
#endif
/* __xpv */
}

/*
 * Allocate a new page table page and form the PTP entry for it in *pteval.
 * The PAE-32 top level table (level == 2) gets only PT_VALID; all other
 * levels get the full ptp_bits.  Returns the new table's address.
 */
paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

/*
 * Return a pointer to PTE "index" within the (1:1 accessible) page table
 * at "table".  pte_size is 8 with PAE, 4 without.
 */
x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

/* on the metal, machine addresses and physical addresses are identical */
#if !defined(__xpv)
#define	maddr_t	paddr_t
#endif /* !__xpv */

/*
 * Add a mapping for the machine page at the given virtual address.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;	/* level > 0 means a large page */
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;	/* keep kernel mappings in the TLB */

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active.
 * On real hardware we just write to
 * the pagetables which aren't in use yet.
 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}

/*
 * Add a mapping for the physical page at the given virtual address.
 * On the metal pa_to_ma() is the identity, so this is the same as
 * map_ma_at_va().
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;

static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			/* shift the tail up to open a slot at i + 1 */
			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			/* ml + 1 still holds the original addr/size */
			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start?
 */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}

/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
#ifdef __xpv
typedef struct {
	uint32_t base_addr_low;
	uint32_t base_addr_high;
	uint32_t length_low;
	uint32_t length_high;
	uint32_t type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Build the list of usable PCI address ranges: start with the full
 * pci_lo_limit..pci_hi_limit window, then punch out every range found in
 * the "num" BIOS memory map entries at "mem".
 */
static void
build_pcimemlists(mmap_t *mem, int num)
{
	mmap_t *mmap;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	/*
	 * Fill in PCI memlists.
 */
	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
		/* reassemble the 64-bit base and length from 32-bit halves */
		start = ((uint64_t)mmap->base_addr_high << 32) +
		    mmap->base_addr_low;
		end = start + ((uint64_t)mmap->length_high << 32) +
		    mmap->length_low;

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", mmap->type, start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}

	/*
	 * link the entries together with native size pointers
	 */
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];

static void
init_mem_alloc(void)
{
	int local;	/* stack variable used to locate the scratch region */
	paddr_t scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment. That should be enough
	 * for the page tables we'll need to build.
 * The nucleus memory is
	 * allocated last and will be outside the addressable range. We'll
	 * switch to new page tables before we unpack the kernel.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list; only dom0 can ask the hypervisor
		 * for the real machine memory map
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}

#else	/* !__xpv */

/*
 * During memory allocation, find the highest address not used yet.
662 */ 663 static void 664 check_higher(paddr_t a) 665 { 666 if (a < next_avail_addr) 667 return; 668 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); 669 DBG(next_avail_addr); 670 } 671 672 /* 673 * Walk through the module information finding the last used address. 674 * The first available address will become the top level page table. 675 * 676 * We then build the phys_install memlist from the multiboot information. 677 */ 678 static void 679 init_mem_alloc(void) 680 { 681 mb_memory_map_t *mmap; 682 mb_module_t *mod; 683 uint64_t start; 684 uint64_t end; 685 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ 686 extern char _end[]; 687 int i; 688 689 DBG_MSG("Entered init_mem_alloc()\n"); 690 DBG((uintptr_t)mb_info); 691 692 /* 693 * search the modules to find the last used address 694 * we'll build the module list while we're walking through here 695 */ 696 DBG_MSG("\nFinding Modules\n"); 697 check_higher((paddr_t)&_end); 698 for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0; 699 i < mb_info->mods_count; 700 ++mod, ++i) { 701 if (prom_debug) { 702 dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n", 703 i, (char *)(mod->mod_name), 704 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end); 705 } 706 modules[i].bm_addr = mod->mod_start; 707 modules[i].bm_size = mod->mod_end; 708 709 check_higher(mod->mod_end); 710 } 711 bi->bi_modules = (native_ptr_t)modules; 712 DBG(bi->bi_modules); 713 bi->bi_module_cnt = mb_info->mods_count; 714 DBG(bi->bi_module_cnt); 715 716 /* 717 * Walk through the memory map from multiboot and build our memlist 718 * structures. Note these will have native format pointers. 
719 */ 720 DBG_MSG("\nFinding Memory Map\n"); 721 DBG(mb_info->flags); 722 max_mem = 0; 723 if (mb_info->flags & 0x40) { 724 int cnt = 0; 725 726 DBG(mb_info->mmap_addr); 727 DBG(mb_info->mmap_length); 728 check_higher(mb_info->mmap_addr + mb_info->mmap_length); 729 730 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; 731 (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length; 732 mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size 733 + sizeof (mmap->size))) { 734 ++cnt; 735 start = ((uint64_t)mmap->base_addr_high << 32) + 736 mmap->base_addr_low; 737 end = start + ((uint64_t)mmap->length_high << 32) + 738 mmap->length_low; 739 740 if (prom_debug) 741 dboot_printf("\ttype: %d %" PRIx64 "..%" 742 PRIx64 "\n", mmap->type, start, end); 743 744 /* 745 * page align start and end 746 */ 747 start = (start + page_offset) & ~page_offset; 748 end &= ~page_offset; 749 if (end <= start) 750 continue; 751 752 /* 753 * only type 1 is usable RAM 754 */ 755 if (mmap->type != 1) 756 continue; 757 758 if (end > max_mem) 759 max_mem = end; 760 761 memlists[memlists_used].addr = start; 762 memlists[memlists_used].size = end - start; 763 ++memlists_used; 764 if (memlists_used > MAX_MEMLIST) 765 dboot_panic("too many memlists"); 766 } 767 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt); 768 } else if (mb_info->flags & 0x01) { 769 DBG(mb_info->mem_lower); 770 memlists[memlists_used].addr = 0; 771 memlists[memlists_used].size = mb_info->mem_lower * 1024; 772 ++memlists_used; 773 DBG(mb_info->mem_upper); 774 memlists[memlists_used].addr = 1024 * 1024; 775 memlists[memlists_used].size = mb_info->mem_upper * 1024; 776 ++memlists_used; 777 778 /* 779 * Old platform - assume I/O space at the end of memory. 
		 */
		pcimemlists[0].addr =
		    (mb_info->mem_upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	/* don't let later allocations land on top of the command line */
	check_higher(bi->bi_cmdline);

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();
}
#endif /* !__xpv */

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 *
 * Returns a zeroed region of "size" bytes (rounded up to whole pages),
 * aligned to "align".
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 *
	 * NOTE(review): if no memlist can satisfy the request, "best"
	 * keeps this sentinel value and the memset below scribbles on a
	 * bogus address -- there is no explicit out-of-memory check on
	 * the metal.
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;	/* memlists are pseudo-physical */
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	/* only memory below scratch_end is mapped for us to write into */
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

/*
 * Allocate "size" bytes of zeroed, page aligned memory.
 */
void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}


/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
	uint64_t next_mapping;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable;
	 * under the hypervisor we inherit it from the start info.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done.
 */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)
	/*
	 * Skip memory between 1M and _start, this acts as a reserve
	 * of memory usable for DMA.
	 */
	next_mapping = (uintptr_t)_start & MMU_PAGEMASK;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		if (start < next_mapping)
			start = next_mapping;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		/* only map as far as memory has actually been allocated */
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}

/* message given when the obsolete GRUB "multiboot" entry is detected */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"

/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at virtual address of target_kernel_text using large page
 * mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif	/* __xpv */

	/*
	 * At this point we are executing in a 32 bit real mode.
	 */
#if defined(__xpv)
	cmdline = (char *)xen_info->cmd_line;
#else /* __xpv */
	cmdline = (char *)mb_info->cmdline;
#endif /* __xpv */

	/* debug output flags come from the kernel command line */
	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable io operations, so set I/O privilege level to 1.
1027 */ 1028 if (DOMAIN_IS_INITDOMAIN(xen_info)) { 1029 set_iopl.iopl = 1; 1030 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1031 } 1032 #endif /* __xpv */ 1033 1034 bcons_init(cmdline); 1035 DBG_MSG("\n\nSolaris prekernel set: "); 1036 DBG_MSG(cmdline); 1037 DBG_MSG("\n"); 1038 1039 if (strstr(cmdline, "multiboot") != NULL) { 1040 dboot_panic(NO_MULTIBOOT); 1041 } 1042 1043 /* 1044 * boot info must be 16 byte aligned for 64 bit kernel ABI 1045 */ 1046 addr = (uintptr_t)boot_info; 1047 addr = (addr + 0xf) & ~0xf; 1048 bi = (struct xboot_info *)addr; 1049 DBG((uintptr_t)bi); 1050 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; 1051 1052 /* 1053 * Need correct target_kernel_text value 1054 */ 1055 #if defined(_BOOT_TARGET_amd64) 1056 target_kernel_text = KERNEL_TEXT_amd64; 1057 #elif defined(__xpv) 1058 target_kernel_text = KERNEL_TEXT_i386_xpv; 1059 #else 1060 target_kernel_text = KERNEL_TEXT_i386; 1061 #endif 1062 DBG(target_kernel_text); 1063 1064 #if defined(__xpv) 1065 1066 /* 1067 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled 1068 */ 1069 1070 #if defined(_BOOT_TARGET_amd64) 1071 /* 1072 * 64-bit hypervisor. 
1073 */ 1074 amd64_support = 1; 1075 pae_support = 1; 1076 1077 #else /* _BOOT_TARGET_amd64 */ 1078 1079 /* 1080 * See if we are running on a PAE Hypervisor 1081 */ 1082 { 1083 xen_capabilities_info_t caps; 1084 1085 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0) 1086 dboot_panic("HYPERVISOR_xen_version(caps) failed"); 1087 caps[sizeof (caps) - 1] = 0; 1088 if (prom_debug) 1089 dboot_printf("xen capabilities %s\n", caps); 1090 if (strstr(caps, "x86_32p") != NULL) 1091 pae_support = 1; 1092 } 1093 1094 #endif /* _BOOT_TARGET_amd64 */ 1095 { 1096 xen_platform_parameters_t p; 1097 1098 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0) 1099 dboot_panic("HYPERVISOR_xen_version(parms) failed"); 1100 DBG(p.virt_start); 1101 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start); 1102 } 1103 1104 /* 1105 * The hypervisor loads stuff starting at 1Gig 1106 */ 1107 mfn_base = ONE_GIG; 1108 DBG(mfn_base); 1109 1110 /* 1111 * enable writable page table mode for the hypervisor 1112 */ 1113 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 1114 VMASST_TYPE_writable_pagetables) < 0) 1115 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed"); 1116 1117 /* 1118 * check for NX support 1119 */ 1120 if (pae_support) { 1121 uint32_t eax = 0x80000000; 1122 uint32_t edx = get_cpuid_edx(&eax); 1123 1124 if (eax >= 0x80000001) { 1125 eax = 0x80000001; 1126 edx = get_cpuid_edx(&eax); 1127 if (edx & CPUID_AMD_EDX_NX) 1128 NX_support = 1; 1129 } 1130 } 1131 1132 #if !defined(_BOOT_TARGET_amd64) 1133 1134 /* 1135 * The 32-bit hypervisor uses segmentation to protect itself from 1136 * guests. This means when a guest attempts to install a flat 4GB 1137 * code or data descriptor the 32-bit hypervisor will protect itself 1138 * by silently shrinking the segment such that if the guest attempts 1139 * any access where the hypervisor lives a #gp fault is generated. 
1140 * The problem is that some applications expect a full 4GB flat 1141 * segment for their current thread pointer and will use negative 1142 * offset segment wrap around to access data. TLS support in linux 1143 * brand is one example of this. 1144 * 1145 * The 32-bit hypervisor can catch the #gp fault in these cases 1146 * and emulate the access without passing the #gp fault to the guest 1147 * but only if VMASST_TYPE_4gb_segments is explicitly turned on. 1148 * Seems like this should have been the default. 1149 * Either way, we want the hypervisor -- and not Solaris -- to deal 1150 * to deal with emulating these accesses. 1151 */ 1152 if (HYPERVISOR_vm_assist(VMASST_CMD_enable, 1153 VMASST_TYPE_4gb_segments) < 0) 1154 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed"); 1155 #endif /* !_BOOT_TARGET_amd64 */ 1156 1157 #else /* __xpv */ 1158 1159 /* 1160 * use cpuid to enable MMU features 1161 */ 1162 if (have_cpuid()) { 1163 uint32_t eax, edx; 1164 1165 eax = 1; 1166 edx = get_cpuid_edx(&eax); 1167 if (edx & CPUID_INTC_EDX_PSE) 1168 largepage_support = 1; 1169 if (edx & CPUID_INTC_EDX_PGE) 1170 pge_support = 1; 1171 if (edx & CPUID_INTC_EDX_PAE) 1172 pae_support = 1; 1173 1174 eax = 0x80000000; 1175 edx = get_cpuid_edx(&eax); 1176 if (eax >= 0x80000001) { 1177 eax = 0x80000001; 1178 edx = get_cpuid_edx(&eax); 1179 if (edx & CPUID_AMD_EDX_LM) 1180 amd64_support = 1; 1181 if (edx & CPUID_AMD_EDX_NX) 1182 NX_support = 1; 1183 } 1184 } else { 1185 dboot_printf("cpuid not supported\n"); 1186 } 1187 #endif /* __xpv */ 1188 1189 1190 #if defined(_BOOT_TARGET_amd64) 1191 if (amd64_support == 0) 1192 dboot_panic("long mode not supported, rebooting"); 1193 else if (pae_support == 0) 1194 dboot_panic("long mode, but no PAE; rebooting"); 1195 #else 1196 /* 1197 * Allow the command line to over-ride use of PAE for 32 bit. 
1198 */ 1199 if (strstr(cmdline, "disablePAE=true") != NULL) { 1200 pae_support = 0; 1201 NX_support = 0; 1202 amd64_support = 0; 1203 } 1204 #endif 1205 1206 /* 1207 * initialize the simple memory allocator 1208 */ 1209 init_mem_alloc(); 1210 1211 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64) 1212 /* 1213 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory 1214 */ 1215 if (max_mem < FOUR_GIG && NX_support == 0) 1216 pae_support = 0; 1217 #endif 1218 1219 /* 1220 * configure mmu information 1221 */ 1222 if (pae_support) { 1223 shift_amt = shift_amt_pae; 1224 ptes_per_table = 512; 1225 pte_size = 8; 1226 lpagesize = TWO_MEG; 1227 #if defined(_BOOT_TARGET_amd64) 1228 top_level = 3; 1229 #else 1230 top_level = 2; 1231 #endif 1232 } else { 1233 pae_support = 0; 1234 NX_support = 0; 1235 shift_amt = shift_amt_nopae; 1236 ptes_per_table = 1024; 1237 pte_size = 4; 1238 lpagesize = FOUR_MEG; 1239 top_level = 1; 1240 } 1241 1242 DBG(pge_support); 1243 DBG(NX_support); 1244 DBG(largepage_support); 1245 DBG(amd64_support); 1246 DBG(top_level); 1247 DBG(pte_size); 1248 DBG(ptes_per_table); 1249 DBG(lpagesize); 1250 1251 #if defined(__xpv) 1252 ktext_phys = ONE_GIG; /* from UNIX Mapfile */ 1253 #else 1254 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */ 1255 #endif 1256 1257 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64) 1258 /* 1259 * For grub, copy kernel bits from the ELF64 file to final place. 1260 */ 1261 DBG_MSG("\nAllocating nucleus pages.\n"); 1262 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); 1263 if (ktext_phys == 0) 1264 dboot_panic("failed to allocate aligned kernel memory"); 1265 if (dboot_elfload64(mb_header.load_addr) != 0) 1266 dboot_panic("failed to parse kernel ELF image, rebooting"); 1267 #endif 1268 1269 DBG(ktext_phys); 1270 1271 /* 1272 * Allocate page tables. 
1273 */ 1274 build_page_tables(); 1275 1276 /* 1277 * return to assembly code to switch to running kernel 1278 */ 1279 entry_addr_low = (uint32_t)target_kernel_text; 1280 DBG(entry_addr_low); 1281 bi->bi_use_largepage = largepage_support; 1282 bi->bi_use_pae = pae_support; 1283 bi->bi_use_pge = pge_support; 1284 bi->bi_use_nx = NX_support; 1285 1286 #if defined(__xpv) 1287 1288 bi->bi_next_paddr = next_avail_addr - mfn_base; 1289 DBG(bi->bi_next_paddr); 1290 bi->bi_next_vaddr = (native_ptr_t)next_avail_addr; 1291 DBG(bi->bi_next_vaddr); 1292 1293 /* 1294 * unmap unused pages in start area to make them available for DMA 1295 */ 1296 while (next_avail_addr < scratch_end) { 1297 (void) HYPERVISOR_update_va_mapping(next_avail_addr, 1298 0, UVMF_INVLPG | UVMF_LOCAL); 1299 next_avail_addr += MMU_PAGESIZE; 1300 } 1301 1302 bi->bi_xen_start_info = (uintptr_t)xen_info; 1303 DBG((uintptr_t)HYPERVISOR_shared_info); 1304 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info; 1305 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base; 1306 1307 #else /* __xpv */ 1308 1309 bi->bi_next_paddr = next_avail_addr; 1310 DBG(bi->bi_next_paddr); 1311 bi->bi_next_vaddr = (uintptr_t)next_avail_addr; 1312 DBG(bi->bi_next_vaddr); 1313 bi->bi_mb_info = (uintptr_t)mb_info; 1314 bi->bi_top_page_table = (uintptr_t)top_page_table; 1315 1316 #endif /* __xpv */ 1317 1318 bi->bi_kseg_size = FOUR_MEG; 1319 DBG(bi->bi_kseg_size); 1320 1321 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n"); 1322 } 1323