1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright 2020 Joyent, Inc.
27 */
28
29
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/framebuffer.h>
40 #include <sys/sha1.h>
41 #include <util/string.h>
42 #include <util/strtolctype.h>
43 #include <sys/efi.h>
44
/*
 * Compile time debug knob. We do not have any early mechanism to control it
 * as the boot is the earliest mechanism we have, and we do not want to have
 * it being switched on by default.
 */
int dboot_debug = 0;

#if defined(__xpv)

/*
 * Hypervisor build: mfn_to_pfn_mapping is the machine-frame to
 * pseudo-physical-frame table that ma_to_pa() indexes by mfn.
 */
#include <sys/hypervisor.h>
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;

#else /* !__xpv */

/*
 * Bare metal build: these symbols are defined outside of this file.
 */
extern multiboot_header_t mb_header;
extern uint32_t mb2_load_addr;
extern int have_cpuid(void);

#endif /* !__xpv */
65
66 #include <sys/inttypes.h>
67 #include <sys/bootinfo.h>
68 #include <sys/mach_mmu.h>
69 #include <sys/boot_console.h>
70
71 #include "dboot_asm.h"
72 #include "dboot_printf.h"
73 #include "dboot_xboot.h"
74 #include "dboot_elfload.h"
75
/*
 * Number of characters in a SHA-1 digest written as ASCII hex:
 * two characters for each digest byte.
 */
#define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
77
/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or a XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running without paging at that point, all such memory is
 * accessible.
 */
95
/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 *
 * ptp_bits is what make_ptable() ORs into the entry for a new page table;
 * pte_bits is what map_ma_at_va() ORs into the entry for a mapped page.
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

/*
 * Mappings at or above this virtual address are marked PT_GLOBAL by
 * map_ma_at_va() when pge_support is set.
 */
static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 *
 * check_higher() raises this to the next page boundary above any address
 * it is told is in use.
 */
static paddr_t next_avail_addr = 0;
121
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else /* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 *
 * multiboot_version selects between the multiboot 1 (mb_info) and
 * multiboot 2 (mb2_info) accessors used throughout this file.
 * num_entries/num_entries_set cache the result of
 * dboot_loader_mmap_entries().
 */
uint32_t mb_magic;	/* magic from boot loader */
uint32_t mb_addr;	/* multiboot info package from loader */
int multiboot_version;
multiboot_info_t *mb_info;
multiboot2_info_header_t *mb2_info;
multiboot_tag_mmap_t *mb2_mmap_tagp;
int num_entries;	/* mmap entry count */
boolean_t num_entries_set;	/* is mmap entry count set */
uintptr_t load_addr;
static boot_framebuffer_t framebuffer __aligned(16);
static boot_framebuffer_t *fb;

/* can not be automatic variables because of alignment */
static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
static efi_guid_t smbios = SMBIOS_TABLE_GUID;
static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
#endif /* __xpv */
157
/*
 * This contains information passed to the kernel
 *
 * The code in this file fills it in through the bi pointer.
 */
struct xboot_info boot_info __aligned(16);
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;	/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;	/* selects x86pte_t vs. x86pte32_t sized entries */
int pge_support = 0;	/* allows PT_GLOBAL on kernel text mappings */
int NX_support = 0;
int PAT_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 *
 * Each array has a matching *_used count of the entries currently in use:
 *	memlists	- physically installed memory (sort_physinstall())
 *	pcimemlists	- address space left for PCI (build_pcimemlists())
 *	rsvdmemlists	- BIOS reserved memory (build_rsvdmemlists())
 */
#define MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;
203
#ifdef __xpv
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
typedef struct {
	uint32_t base_addr_low;
	uint32_t base_addr_high;
	uint32_t length_low;
	uint32_t length_high;
	uint32_t type;
} mmap_t;

/*
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
/*
 * map_buffer receives the machine memory map requested from the hypervisor
 * in init_mem_alloc(); MAXMAPS is its capacity in entries.
 */
#define MAXMAPS 100
static mmap_t map_buffer[MAXMAPS];
#else
typedef mb_memory_map_t mmap_t;
#endif

/*
 * Debugging macros
 *
 * prom_debug and map_debug gate the extra dboot_printf() output in this
 * file (memlist dumps and page table mapping traces respectively).
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

static char noname[2] = "-";
235
236 /*
237 * Either hypervisor-specific or grub-specific code builds the initial
238 * memlists. This code does the sort/merge/link for final use.
239 */
240 static void
sort_physinstall(void)241 sort_physinstall(void)
242 {
243 int i;
244 #if !defined(__xpv)
245 int j;
246 struct boot_memlist tmp;
247
248 /*
249 * Now sort the memlists, in case they weren't in order.
250 * Yeah, this is a bubble sort; small, simple and easy to get right.
251 */
252 DBG_MSG("Sorting phys-installed list\n");
253 for (j = memlists_used - 1; j > 0; --j) {
254 for (i = 0; i < j; ++i) {
255 if (memlists[i].addr < memlists[i + 1].addr)
256 continue;
257 tmp = memlists[i];
258 memlists[i] = memlists[i + 1];
259 memlists[i + 1] = tmp;
260 }
261 }
262
263 /*
264 * Merge any memlists that don't have holes between them.
265 */
266 for (i = 0; i <= memlists_used - 1; ++i) {
267 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
268 continue;
269
270 if (prom_debug)
271 dboot_printf(
272 "merging mem segs %" PRIx64 "...%" PRIx64
273 " w/ %" PRIx64 "...%" PRIx64 "\n",
274 memlists[i].addr,
275 memlists[i].addr + memlists[i].size,
276 memlists[i + 1].addr,
277 memlists[i + 1].addr + memlists[i + 1].size);
278
279 memlists[i].size += memlists[i + 1].size;
280 for (j = i + 1; j < memlists_used - 1; ++j)
281 memlists[j] = memlists[j + 1];
282 --memlists_used;
283 DBG(memlists_used);
284 --i; /* after merging we need to reexamine, so do this */
285 }
286 #endif /* __xpv */
287
288 if (prom_debug) {
289 dboot_printf("\nFinal memlists:\n");
290 for (i = 0; i < memlists_used; ++i) {
291 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
292 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
293 }
294 }
295
296 /*
297 * link together the memlists with native size pointers
298 */
299 memlists[0].next = 0;
300 memlists[0].prev = 0;
301 for (i = 1; i < memlists_used; ++i) {
302 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
303 memlists[i].next = 0;
304 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
305 }
306 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
307 DBG(bi->bi_phys_install);
308 }
309
310 /*
311 * build bios reserved memlists
312 */
313 static void
build_rsvdmemlists(void)314 build_rsvdmemlists(void)
315 {
316 int i;
317
318 rsvdmemlists[0].next = 0;
319 rsvdmemlists[0].prev = 0;
320 for (i = 1; i < rsvdmemlists_used; ++i) {
321 rsvdmemlists[i].prev =
322 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
323 rsvdmemlists[i].next = 0;
324 rsvdmemlists[i - 1].next =
325 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
326 }
327 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
328 DBG(bi->bi_rsvdmem);
329 }
330
331 #if defined(__xpv)
332
333 /*
334 * halt on the hypervisor after a delay to drain console output
335 */
336 void
dboot_halt(void)337 dboot_halt(void)
338 {
339 uint_t i = 10000;
340
341 while (--i)
342 (void) HYPERVISOR_yield();
343 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
344 }
345
346 /*
347 * From a machine address, find the corresponding pseudo-physical address.
348 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
349 * Machine addresses are the real underlying hardware addresses.
350 * These are needed for page table entries. Note that this routine is
351 * poorly protected. A bad value of "ma" will cause a page fault.
352 */
353 paddr_t
ma_to_pa(maddr_t ma)354 ma_to_pa(maddr_t ma)
355 {
356 ulong_t pgoff = ma & MMU_PAGEOFFSET;
357 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
358 paddr_t pa;
359
360 if (pfn >= xen_info->nr_pages)
361 return (-(paddr_t)1);
362 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
363 #ifdef DEBUG
364 if (ma != pa_to_ma(pa))
365 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
366 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
367 #endif
368 return (pa);
369 }
370
371 /*
372 * From a pseudo-physical address, find the corresponding machine address.
373 */
374 maddr_t
pa_to_ma(paddr_t pa)375 pa_to_ma(paddr_t pa)
376 {
377 pfn_t pfn;
378 ulong_t mfn;
379
380 pfn = mmu_btop(pa - mfn_base);
381 if (pa < mfn_base || pfn >= xen_info->nr_pages)
382 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
383 mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
384 #ifdef DEBUG
385 if (mfn_to_pfn_mapping[mfn] != pfn)
386 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
387 pfn, mfn, mfn_to_pfn_mapping[mfn]);
388 #endif
389 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
390 }
391
392 #endif /* __xpv */
393
394 x86pte_t
get_pteval(paddr_t table,uint_t index)395 get_pteval(paddr_t table, uint_t index)
396 {
397 if (pae_support)
398 return (((x86pte_t *)(uintptr_t)table)[index]);
399 return (((x86pte32_t *)(uintptr_t)table)[index]);
400 }
401
402 /*ARGSUSED*/
403 void
set_pteval(paddr_t table,uint_t index,uint_t level,x86pte_t pteval)404 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
405 {
406 #ifdef __xpv
407 mmu_update_t t;
408 maddr_t mtable = pa_to_ma(table);
409 int retcnt;
410
411 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
412 t.val = pteval;
413 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
414 dboot_panic("HYPERVISOR_mmu_update() failed");
415 #else /* __xpv */
416 uintptr_t tab_addr = (uintptr_t)table;
417
418 if (pae_support)
419 ((x86pte_t *)tab_addr)[index] = pteval;
420 else
421 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
422 if (level == top_level && level == 2)
423 reload_cr3();
424 #endif /* __xpv */
425 }
426
427 paddr_t
make_ptable(x86pte_t * pteval,uint_t level)428 make_ptable(x86pte_t *pteval, uint_t level)
429 {
430 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
431
432 if (level == top_level && level == 2)
433 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
434 else
435 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
436
437 #ifdef __xpv
438 /* Remove write permission to the new page table. */
439 if (HYPERVISOR_update_va_mapping(new_table,
440 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
441 dboot_panic("HYP_update_va_mapping error");
442 #endif
443
444 if (map_debug)
445 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
446 PRIx64 "\n", level, (ulong_t)new_table, *pteval);
447 return (new_table);
448 }
449
450 x86pte_t *
map_pte(paddr_t table,uint_t index)451 map_pte(paddr_t table, uint_t index)
452 {
453 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
454 }
455
/*
 * dump out the contents of page tables...
 *
 * The tables are walked depth first starting from top_page_table. Rather
 * than recursing, the walk keeps the table and index of each level being
 * visited in save_table[]/save_index[] and uses gotos to descend into, and
 * return from, a lower level table. Entries that are zero are skipped, and
 * runs of physically consecutive mappings are abbreviated with "...".
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	/* tabs + l yields (top_level - l) tabs: deeper levels indent more */
	char *tabs = tablist + 3 - top_level;
	/* note: uint_t, so values from ma_to_pa() are held at that width */
	uint_t pa, pa1;
#if !defined(__xpv)
/* NOTE(review): a #define is not scoped to this function; it stays in */
/* effect for the remainder of the file. */
#define maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		/* amount of address space one entry at this level covers */
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		/*
		 * Descend when this entry references a lower level table:
		 * any entry above level 1, or a level 1 entry that is not a
		 * large page. The current position is saved, index is set to
		 * -1 so the loop's ++index restarts at entry 0 of the child.
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		/*
		 * Count how many following entries continue the same
		 * physically contiguous run; i ends as the run length.
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		/*
		 * For runs longer than two, skip the middle entries so that
		 * only the first and last entry of the run are printed.
		 */
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 255)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/*
	 * Finished a lower level table: restore the parent's position and
	 * jump back into the loop so that it advances to the parent's next
	 * entry. The walk is complete once the top level table is done.
	 */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
542
543 /*
544 * Add a mapping for the machine page at the given virtual address.
545 */
546 static void
map_ma_at_va(maddr_t ma,native_ptr_t va,uint_t level)547 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
548 {
549 x86pte_t *ptep;
550 x86pte_t pteval;
551
552 pteval = ma | pte_bits;
553 if (level > 0)
554 pteval |= PT_PAGESIZE;
555 if (va >= target_kernel_text && pge_support)
556 pteval |= PT_GLOBAL;
557
558 if (map_debug && ma != va)
559 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
560 " pte=0x%" PRIx64 " l=%d\n",
561 (uint64_t)ma, (uint64_t)va, pteval, level);
562
563 #if defined(__xpv)
564 /*
565 * see if we can avoid find_pte() on the hypervisor
566 */
567 if (HYPERVISOR_update_va_mapping(va, pteval,
568 UVMF_INVLPG | UVMF_LOCAL) == 0)
569 return;
570 #endif
571
572 /*
573 * Find the pte that will map this address. This creates any
574 * missing intermediate level page tables
575 */
576 ptep = find_pte(va, NULL, level, 0);
577
578 /*
579 * When paravirtualized, we must use hypervisor calls to modify the
580 * PTE, since paging is active. On real hardware we just write to
581 * the pagetables which aren't in use yet.
582 */
583 #if defined(__xpv)
584 ptep = ptep; /* shut lint up */
585 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
586 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
587 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
588 (uint64_t)va, level, (uint64_t)ma, pteval);
589 #else
590 if (va < 1024 * 1024)
591 pteval |= PT_NOCACHE; /* for video RAM */
592 if (pae_support)
593 *ptep = pteval;
594 else
595 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
596 #endif
597 }
598
599 /*
600 * Add a mapping for the physical page at the given virtual address.
601 */
602 static void
map_pa_at_va(paddr_t pa,native_ptr_t va,uint_t level)603 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
604 {
605 map_ma_at_va(pa_to_ma(pa), va, level);
606 }
607
608 /*
609 * This is called to remove start..end from the
610 * possible range of PCI addresses.
611 */
612 const uint64_t pci_lo_limit = 0x00100000ul;
613 const uint64_t pci_hi_limit = 0xfff00000ul;
614 static void
exclude_from_pci(uint64_t start,uint64_t end)615 exclude_from_pci(uint64_t start, uint64_t end)
616 {
617 int i;
618 int j;
619 struct boot_memlist *ml;
620
621 for (i = 0; i < pcimemlists_used; ++i) {
622 ml = &pcimemlists[i];
623
624 /* delete the entire range? */
625 if (start <= ml->addr && ml->addr + ml->size <= end) {
626 --pcimemlists_used;
627 for (j = i; j < pcimemlists_used; ++j)
628 pcimemlists[j] = pcimemlists[j + 1];
629 --i; /* to revisit the new one at this index */
630 }
631
632 /* split a range? */
633 else if (ml->addr < start && end < ml->addr + ml->size) {
634
635 ++pcimemlists_used;
636 if (pcimemlists_used > MAX_MEMLIST)
637 dboot_panic("too many pcimemlists");
638
639 for (j = pcimemlists_used - 1; j > i; --j)
640 pcimemlists[j] = pcimemlists[j - 1];
641 ml->size = start - ml->addr;
642
643 ++ml;
644 ml->size = (ml->addr + ml->size) - end;
645 ml->addr = end;
646 ++i; /* skip on to next one */
647 }
648
649 /* cut memory off the start? */
650 else if (ml->addr < end && end < ml->addr + ml->size) {
651 ml->size -= end - ml->addr;
652 ml->addr = end;
653 }
654
655 /* cut memory off the end? */
656 else if (ml->addr <= start && start < ml->addr + ml->size) {
657 ml->size = start - ml->addr;
658 }
659 }
660 }
661
662 /*
663 * During memory allocation, find the highest address not used yet.
664 */
665 static void
check_higher(paddr_t a)666 check_higher(paddr_t a)
667 {
668 if (a < next_avail_addr)
669 return;
670 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
671 DBG(next_avail_addr);
672 }
673
674 static int
dboot_loader_mmap_entries(void)675 dboot_loader_mmap_entries(void)
676 {
677 #if !defined(__xpv)
678 if (num_entries_set == B_TRUE)
679 return (num_entries);
680
681 switch (multiboot_version) {
682 case 1:
683 DBG(mb_info->flags);
684 if (mb_info->flags & 0x40) {
685 mb_memory_map_t *mmap;
686 caddr32_t mmap_addr;
687
688 DBG(mb_info->mmap_addr);
689 DBG(mb_info->mmap_length);
690 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
691
692 for (mmap_addr = mb_info->mmap_addr;
693 mmap_addr < mb_info->mmap_addr +
694 mb_info->mmap_length;
695 mmap_addr += mmap->size + sizeof (mmap->size)) {
696 mmap = (mb_memory_map_t *)(uintptr_t)mmap_addr;
697 ++num_entries;
698 }
699
700 num_entries_set = B_TRUE;
701 }
702 break;
703 case 2:
704 num_entries_set = B_TRUE;
705 num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
706 mb2_mmap_tagp);
707 break;
708 default:
709 dboot_panic("Unknown multiboot version: %d\n",
710 multiboot_version);
711 break;
712 }
713 return (num_entries);
714 #else
715 return (MAXMAPS);
716 #endif
717 }
718
719 static uint32_t
dboot_loader_mmap_get_type(int index)720 dboot_loader_mmap_get_type(int index)
721 {
722 #if !defined(__xpv)
723 mb_memory_map_t *mp, *mpend;
724 int i;
725
726 switch (multiboot_version) {
727 case 1:
728 mp = (mb_memory_map_t *)(uintptr_t)mb_info->mmap_addr;
729 mpend = (mb_memory_map_t *)(uintptr_t)
730 (mb_info->mmap_addr + mb_info->mmap_length);
731
732 for (i = 0; mp < mpend && i != index; i++)
733 mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
734 sizeof (mp->size));
735 if (mp >= mpend) {
736 dboot_panic("dboot_loader_mmap_get_type(): index "
737 "out of bounds: %d\n", index);
738 }
739 return (mp->type);
740
741 case 2:
742 return (dboot_multiboot2_mmap_get_type(mb2_info,
743 mb2_mmap_tagp, index));
744
745 default:
746 dboot_panic("Unknown multiboot version: %d\n",
747 multiboot_version);
748 break;
749 }
750 return (0);
751 #else
752 return (map_buffer[index].type);
753 #endif
754 }
755
756 static uint64_t
dboot_loader_mmap_get_base(int index)757 dboot_loader_mmap_get_base(int index)
758 {
759 #if !defined(__xpv)
760 mb_memory_map_t *mp, *mpend;
761 int i;
762
763 switch (multiboot_version) {
764 case 1:
765 mp = (mb_memory_map_t *)mb_info->mmap_addr;
766 mpend = (mb_memory_map_t *)
767 (mb_info->mmap_addr + mb_info->mmap_length);
768
769 for (i = 0; mp < mpend && i != index; i++)
770 mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
771 sizeof (mp->size));
772 if (mp >= mpend) {
773 dboot_panic("dboot_loader_mmap_get_base(): index "
774 "out of bounds: %d\n", index);
775 }
776 return (((uint64_t)mp->base_addr_high << 32) +
777 (uint64_t)mp->base_addr_low);
778
779 case 2:
780 return (dboot_multiboot2_mmap_get_base(mb2_info,
781 mb2_mmap_tagp, index));
782
783 default:
784 dboot_panic("Unknown multiboot version: %d\n",
785 multiboot_version);
786 break;
787 }
788 return (0);
789 #else
790 return (((uint64_t)map_buffer[index].base_addr_high << 32) +
791 (uint64_t)map_buffer[index].base_addr_low);
792 #endif
793 }
794
795 static uint64_t
dboot_loader_mmap_get_length(int index)796 dboot_loader_mmap_get_length(int index)
797 {
798 #if !defined(__xpv)
799 mb_memory_map_t *mp, *mpend;
800 int i;
801
802 switch (multiboot_version) {
803 case 1:
804 mp = (mb_memory_map_t *)mb_info->mmap_addr;
805 mpend = (mb_memory_map_t *)
806 (mb_info->mmap_addr + mb_info->mmap_length);
807
808 for (i = 0; mp < mpend && i != index; i++)
809 mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
810 sizeof (mp->size));
811 if (mp >= mpend) {
812 dboot_panic("dboot_loader_mmap_get_length(): index "
813 "out of bounds: %d\n", index);
814 }
815 return (((uint64_t)mp->length_high << 32) +
816 (uint64_t)mp->length_low);
817
818 case 2:
819 return (dboot_multiboot2_mmap_get_length(mb2_info,
820 mb2_mmap_tagp, index));
821
822 default:
823 dboot_panic("Unknown multiboot version: %d\n",
824 multiboot_version);
825 break;
826 }
827 return (0);
828 #else
829 return (((uint64_t)map_buffer[index].length_high << 32) +
830 (uint64_t)map_buffer[index].length_low);
831 #endif
832 }
833
834 static void
build_pcimemlists(void)835 build_pcimemlists(void)
836 {
837 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
838 uint64_t start;
839 uint64_t end;
840 int i, num;
841
842 /*
843 * initialize
844 */
845 pcimemlists[0].addr = pci_lo_limit;
846 pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
847 pcimemlists_used = 1;
848
849 num = dboot_loader_mmap_entries();
850 /*
851 * Fill in PCI memlists.
852 */
853 for (i = 0; i < num; ++i) {
854 start = dboot_loader_mmap_get_base(i);
855 end = start + dboot_loader_mmap_get_length(i);
856
857 if (prom_debug)
858 dboot_printf("\ttype: %d %" PRIx64 "..%"
859 PRIx64 "\n", dboot_loader_mmap_get_type(i),
860 start, end);
861
862 /*
863 * page align start and end
864 */
865 start = (start + page_offset) & ~page_offset;
866 end &= ~page_offset;
867 if (end <= start)
868 continue;
869
870 exclude_from_pci(start, end);
871 }
872
873 /*
874 * Finish off the pcimemlist
875 */
876 if (prom_debug) {
877 for (i = 0; i < pcimemlists_used; ++i) {
878 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
879 PRIx64 "\n", pcimemlists[i].addr,
880 pcimemlists[i].addr + pcimemlists[i].size);
881 }
882 }
883 pcimemlists[0].next = 0;
884 pcimemlists[0].prev = 0;
885 for (i = 1; i < pcimemlists_used; ++i) {
886 pcimemlists[i].prev =
887 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
888 pcimemlists[i].next = 0;
889 pcimemlists[i - 1].next =
890 (native_ptr_t)(uintptr_t)(pcimemlists + i);
891 }
892 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
893 DBG(bi->bi_pcimem);
894 }
895
896 #if defined(__xpv)
897 /*
898 * Initialize memory allocator stuff from hypervisor-supplied start info.
899 */
900 static void
init_mem_alloc(void)901 init_mem_alloc(void)
902 {
903 int local; /* variables needed to find start region */
904 paddr_t scratch_start;
905 xen_memory_map_t map;
906
907 DBG_MSG("Entered init_mem_alloc()\n");
908
909 /*
910 * Free memory follows the stack. There's at least 512KB of scratch
911 * space, rounded up to at least 2Mb alignment. That should be enough
912 * for the page tables we'll need to build. The nucleus memory is
913 * allocated last and will be outside the addressible range. We'll
914 * switch to new page tables before we unpack the kernel
915 */
916 scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
917 DBG(scratch_start);
918 scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
919 DBG(scratch_end);
920
921 /*
922 * For paranoia, leave some space between hypervisor data and ours.
923 * Use 500 instead of 512.
924 */
925 next_avail_addr = scratch_end - 500 * 1024;
926 DBG(next_avail_addr);
927
928 /*
929 * The domain builder gives us at most 1 module
930 */
931 DBG(xen_info->mod_len);
932 if (xen_info->mod_len > 0) {
933 DBG(xen_info->mod_start);
934 modules[0].bm_addr =
935 (native_ptr_t)(uintptr_t)xen_info->mod_start;
936 modules[0].bm_size = xen_info->mod_len;
937 bi->bi_module_cnt = 1;
938 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
939 } else {
940 bi->bi_module_cnt = 0;
941 bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
942 }
943 DBG(bi->bi_module_cnt);
944 DBG(bi->bi_modules);
945
946 DBG(xen_info->mfn_list);
947 DBG(xen_info->nr_pages);
948 max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
949 DBG(max_mem);
950
951 /*
952 * Using pseudo-physical addresses, so only 1 memlist element
953 */
954 memlists[0].addr = 0;
955 DBG(memlists[0].addr);
956 memlists[0].size = max_mem;
957 DBG(memlists[0].size);
958 memlists_used = 1;
959 DBG(memlists_used);
960
961 /*
962 * finish building physinstall list
963 */
964 sort_physinstall();
965
966 /*
967 * build bios reserved memlists
968 */
969 build_rsvdmemlists();
970
971 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
972 /*
973 * build PCI Memory list
974 */
975 map.nr_entries = MAXMAPS;
976 /*LINTED: constant in conditional context*/
977 set_xen_guest_handle(map.buffer, map_buffer);
978 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
979 dboot_panic("getting XENMEM_machine_memory_map failed");
980 build_pcimemlists();
981 }
982 }
983
984 #else /* !__xpv */
985
986 static void
dboot_multiboot1_xboot_consinfo(void)987 dboot_multiboot1_xboot_consinfo(void)
988 {
989 fb->framebuffer = 0;
990 }
991
992 static void
dboot_multiboot2_xboot_consinfo(void)993 dboot_multiboot2_xboot_consinfo(void)
994 {
995 multiboot_tag_framebuffer_t *fbtag;
996 fbtag = dboot_multiboot2_find_tag(mb2_info,
997 MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
998 fb->framebuffer = (uint64_t)(uintptr_t)fbtag;
999 }
1000
1001 static int
dboot_multiboot_modcount(void)1002 dboot_multiboot_modcount(void)
1003 {
1004 switch (multiboot_version) {
1005 case 1:
1006 return (mb_info->mods_count);
1007
1008 case 2:
1009 return (dboot_multiboot2_modcount(mb2_info));
1010
1011 default:
1012 dboot_panic("Unknown multiboot version: %d\n",
1013 multiboot_version);
1014 break;
1015 }
1016 return (0);
1017 }
1018
1019 static uint32_t
dboot_multiboot_modstart(int index)1020 dboot_multiboot_modstart(int index)
1021 {
1022 switch (multiboot_version) {
1023 case 1:
1024 return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1025
1026 case 2:
1027 return (dboot_multiboot2_modstart(mb2_info, index));
1028
1029 default:
1030 dboot_panic("Unknown multiboot version: %d\n",
1031 multiboot_version);
1032 break;
1033 }
1034 return (0);
1035 }
1036
1037 static uint32_t
dboot_multiboot_modend(int index)1038 dboot_multiboot_modend(int index)
1039 {
1040 switch (multiboot_version) {
1041 case 1:
1042 return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1043
1044 case 2:
1045 return (dboot_multiboot2_modend(mb2_info, index));
1046
1047 default:
1048 dboot_panic("Unknown multiboot version: %d\n",
1049 multiboot_version);
1050 break;
1051 }
1052 return (0);
1053 }
1054
1055 static char *
dboot_multiboot_modcmdline(int index)1056 dboot_multiboot_modcmdline(int index)
1057 {
1058 switch (multiboot_version) {
1059 case 1:
1060 return ((char *)((mb_module_t *)
1061 mb_info->mods_addr)[index].mod_name);
1062
1063 case 2:
1064 return (dboot_multiboot2_modcmdline(mb2_info, index));
1065
1066 default:
1067 dboot_panic("Unknown multiboot version: %d\n",
1068 multiboot_version);
1069 break;
1070 }
1071 return (0);
1072 }
1073
1074 /*
1075 * Find the modules used by console setup.
1076 * Since we need the console to print early boot messages, the console is set up
1077 * before anything else and therefore we need to pick up the needed modules.
1078 *
1079 * Note, we just will search for and if found, will pass the modules
1080 * to console setup, the proper module list processing will happen later.
1081 * Currently used modules are boot environment and console font.
1082 */
1083 static void
dboot_find_console_modules(void)1084 dboot_find_console_modules(void)
1085 {
1086 int i, modcount;
1087 uint32_t mod_start, mod_end;
1088 char *cmdline;
1089
1090 modcount = dboot_multiboot_modcount();
1091 bi->bi_module_cnt = 0;
1092 for (i = 0; i < modcount; ++i) {
1093 cmdline = dboot_multiboot_modcmdline(i);
1094 if (cmdline == NULL)
1095 continue;
1096
1097 if (strstr(cmdline, "type=console-font") != NULL)
1098 modules[bi->bi_module_cnt].bm_type = BMT_FONT;
1099 else if (strstr(cmdline, "type=environment") != NULL)
1100 modules[bi->bi_module_cnt].bm_type = BMT_ENV;
1101 else
1102 continue;
1103
1104 mod_start = dboot_multiboot_modstart(i);
1105 mod_end = dboot_multiboot_modend(i);
1106 modules[bi->bi_module_cnt].bm_addr =
1107 (native_ptr_t)(uintptr_t)mod_start;
1108 modules[bi->bi_module_cnt].bm_size = mod_end - mod_start;
1109 modules[bi->bi_module_cnt].bm_name =
1110 (native_ptr_t)(uintptr_t)NULL;
1111 modules[bi->bi_module_cnt].bm_hash =
1112 (native_ptr_t)(uintptr_t)NULL;
1113 bi->bi_module_cnt++;
1114 }
1115 if (bi->bi_module_cnt != 0)
1116 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1117 }
1118
1119 static boolean_t
dboot_multiboot_basicmeminfo(uint32_t * lower,uint32_t * upper)1120 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1121 {
1122 boolean_t rv = B_FALSE;
1123
1124 switch (multiboot_version) {
1125 case 1:
1126 if (mb_info->flags & 0x01) {
1127 *lower = mb_info->mem_lower;
1128 *upper = mb_info->mem_upper;
1129 rv = B_TRUE;
1130 }
1131 break;
1132
1133 case 2:
1134 return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1135
1136 default:
1137 dboot_panic("Unknown multiboot version: %d\n",
1138 multiboot_version);
1139 break;
1140 }
1141 return (rv);
1142 }
1143
/*
 * Convert one ASCII hexadecimal digit ([0-9a-fA-F]) to its numeric value.
 * Any other character is rejected with a panic instead of being silently
 * converted to a value outside 0..15.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
1158
1159 static void
digest_a2h(const char * ascii,uint8_t * digest)1160 digest_a2h(const char *ascii, uint8_t *digest)
1161 {
1162 unsigned int i;
1163
1164 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1165 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1166 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1167 }
1168 }
1169
1170 /*
1171 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1172 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
1173 * match, return 0, otherwise -1. This works only for images smaller than
1174 * 4 GB, which should not be a problem.
1175 */
1176 static int
check_image_hash(uint_t midx)1177 check_image_hash(uint_t midx)
1178 {
1179 const char *ascii;
1180 const void *image;
1181 size_t len;
1182 SHA1_CTX ctx;
1183 uint8_t digest[SHA1_DIGEST_LENGTH];
1184 uint8_t baseline[SHA1_DIGEST_LENGTH];
1185 unsigned int i;
1186
1187 ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1188 image = (const void *)(uintptr_t)modules[midx].bm_addr;
1189 len = (size_t)modules[midx].bm_size;
1190
1191 digest_a2h(ascii, baseline);
1192
1193 SHA1Init(&ctx);
1194 SHA1Update(&ctx, image, len);
1195 SHA1Final(digest, &ctx);
1196
1197 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1198 if (digest[i] != baseline[i])
1199 return (-1);
1200 }
1201
1202 return (0);
1203 }
1204
1205 static const char *
type_to_str(boot_module_type_t type)1206 type_to_str(boot_module_type_t type)
1207 {
1208 switch (type) {
1209 case BMT_ROOTFS:
1210 return ("rootfs");
1211 case BMT_FILE:
1212 return ("file");
1213 case BMT_HASH:
1214 return ("hash");
1215 case BMT_ENV:
1216 return ("environment");
1217 case BMT_FONT:
1218 return ("console-font");
1219 default:
1220 return ("unknown");
1221 }
1222 }
1223
1224 static void
check_images(void)1225 check_images(void)
1226 {
1227 uint_t i;
1228 char displayhash[SHA1_ASCII_LENGTH + 1];
1229
1230 for (i = 0; i < modules_used; i++) {
1231 if (prom_debug) {
1232 dboot_printf("module #%d: name %s type %s "
1233 "addr %lx size %lx\n",
1234 i, (char *)(uintptr_t)modules[i].bm_name,
1235 type_to_str(modules[i].bm_type),
1236 (ulong_t)modules[i].bm_addr,
1237 (ulong_t)modules[i].bm_size);
1238 }
1239
1240 if (modules[i].bm_type == BMT_HASH ||
1241 modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1242 DBG_MSG("module has no hash; skipping check\n");
1243 continue;
1244 }
1245 (void) memcpy(displayhash,
1246 (void *)(uintptr_t)modules[i].bm_hash,
1247 SHA1_ASCII_LENGTH);
1248 displayhash[SHA1_ASCII_LENGTH] = '\0';
1249 if (prom_debug) {
1250 dboot_printf("checking expected hash [%s]: ",
1251 displayhash);
1252 }
1253
1254 if (check_image_hash(i) != 0)
1255 dboot_panic("hash mismatch!\n");
1256 else
1257 DBG_MSG("OK\n");
1258 }
1259 }
1260
1261 /*
1262 * Determine the module's starting address, size, name, and type, and fill the
1263 * boot_modules structure. This structure is used by the bop code, except for
1264 * hashes which are checked prior to transferring control to the kernel.
1265 */
1266 static void
process_module(int midx)1267 process_module(int midx)
1268 {
1269 uint32_t mod_start = dboot_multiboot_modstart(midx);
1270 uint32_t mod_end = dboot_multiboot_modend(midx);
1271 char *cmdline = dboot_multiboot_modcmdline(midx);
1272 char *p, *q;
1273
1274 check_higher(mod_end);
1275 if (prom_debug) {
1276 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1277 midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1278 }
1279
1280 if (mod_start > mod_end) {
1281 dboot_panic("module #%d: module start address 0x%lx greater "
1282 "than end address 0x%lx", midx,
1283 (ulong_t)mod_start, (ulong_t)mod_end);
1284 }
1285
1286 /*
1287 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1288 * the address of the last valid byte in a module plus 1 as mod_end.
1289 * This is of course a bug; the multiboot specification simply states
1290 * that mod_start and mod_end "contain the start and end addresses of
1291 * the boot module itself" which is pretty obviously not what GRUB is
1292 * doing. However, fixing it requires that not only this code be
1293 * changed but also that other code consuming this value and values
1294 * derived from it be fixed, and that the kernel and GRUB must either
1295 * both have the bug or neither. While there are a lot of combinations
1296 * that will work, there are also some that won't, so for simplicity
1297 * we'll just cope with the bug. That means we won't actually hash the
1298 * byte at mod_end, and we will expect that mod_end for the hash file
1299 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1300 * hash plus a newline for each module). We set bm_size to the true
1301 * correct number of bytes in each module, achieving exactly this.
1302 */
1303
1304 modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1305 modules[midx].bm_size = mod_end - mod_start;
1306 modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1307 modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1308 modules[midx].bm_type = BMT_FILE;
1309
1310 if (cmdline == NULL) {
1311 modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1312 return;
1313 }
1314
1315 p = cmdline;
1316 modules[midx].bm_name =
1317 (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1318
1319 while (p != NULL) {
1320 q = strsep(&p, " \t\f\n\r");
1321 if (strncmp(q, "name=", 5) == 0) {
1322 if (q[5] != '\0' && !isspace(q[5])) {
1323 modules[midx].bm_name =
1324 (native_ptr_t)(uintptr_t)(q + 5);
1325 }
1326 continue;
1327 }
1328
1329 if (strncmp(q, "type=", 5) == 0) {
1330 if (q[5] == '\0' || isspace(q[5]))
1331 continue;
1332 q += 5;
1333 if (strcmp(q, "rootfs") == 0) {
1334 modules[midx].bm_type = BMT_ROOTFS;
1335 } else if (strcmp(q, "hash") == 0) {
1336 modules[midx].bm_type = BMT_HASH;
1337 } else if (strcmp(q, "environment") == 0) {
1338 modules[midx].bm_type = BMT_ENV;
1339 } else if (strcmp(q, "console-font") == 0) {
1340 modules[midx].bm_type = BMT_FONT;
1341 } else if (strcmp(q, "file") != 0) {
1342 dboot_printf("\tmodule #%d: unknown module "
1343 "type '%s'; defaulting to 'file'\n",
1344 midx, q);
1345 }
1346 continue;
1347 }
1348
1349 if (strncmp(q, "hash=", 5) == 0) {
1350 if (q[5] != '\0' && !isspace(q[5])) {
1351 modules[midx].bm_hash =
1352 (native_ptr_t)(uintptr_t)(q + 5);
1353 }
1354 continue;
1355 }
1356
1357 dboot_printf("ignoring unknown option '%s'\n", q);
1358 }
1359 }
1360
1361 /*
1362 * Backward compatibility: if there are exactly one or two modules, both
1363 * of type 'file' and neither with an embedded hash value, we have been
1364 * given the legacy style modules. In this case we need to treat the first
1365 * module as a rootfs and the second as a hash referencing that module.
1366 * Otherwise, even if the configuration is invalid, we assume that the
1367 * operator knows what he's doing or at least isn't being bitten by this
1368 * interface change.
1369 */
1370 static void
fixup_modules(void)1371 fixup_modules(void)
1372 {
1373 if (modules_used == 0 || modules_used > 2)
1374 return;
1375
1376 if (modules[0].bm_type != BMT_FILE ||
1377 (modules_used > 1 && modules[1].bm_type != BMT_FILE)) {
1378 return;
1379 }
1380
1381 if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1382 (modules_used > 1 &&
1383 modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL)) {
1384 return;
1385 }
1386
1387 modules[0].bm_type = BMT_ROOTFS;
1388 if (modules_used > 1) {
1389 modules[1].bm_type = BMT_HASH;
1390 modules[1].bm_name = modules[0].bm_name;
1391 }
1392 }
1393
1394 /*
1395 * For modules that do not have assigned hashes but have a separate hash module,
1396 * find the assigned hash module and set the primary module's bm_hash to point
1397 * to the hash data from that module. We will then ignore modules of type
1398 * BMT_HASH from this point forward.
1399 */
1400 static void
assign_module_hashes(void)1401 assign_module_hashes(void)
1402 {
1403 uint_t i, j;
1404
1405 for (i = 0; i < modules_used; i++) {
1406 if (modules[i].bm_type == BMT_HASH ||
1407 modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1408 continue;
1409 }
1410
1411 for (j = 0; j < modules_used; j++) {
1412 if (modules[j].bm_type != BMT_HASH ||
1413 strcmp((char *)(uintptr_t)modules[j].bm_name,
1414 (char *)(uintptr_t)modules[i].bm_name) != 0) {
1415 continue;
1416 }
1417
1418 if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1419 dboot_printf("Short hash module of length "
1420 "0x%lx bytes; ignoring\n",
1421 (ulong_t)modules[j].bm_size);
1422 } else {
1423 modules[i].bm_hash = modules[j].bm_addr;
1424 }
1425 break;
1426 }
1427 }
1428 }
1429
1430 /*
1431 * Walk through the module information finding the last used address.
1432 * The first available address will become the top level page table.
1433 */
1434 static void
dboot_process_modules(void)1435 dboot_process_modules(void)
1436 {
1437 int i, modcount;
1438 extern char _end[];
1439
1440 DBG_MSG("\nFinding Modules\n");
1441 modcount = dboot_multiboot_modcount();
1442 if (modcount > MAX_BOOT_MODULES) {
1443 dboot_panic("Too many modules (%d) -- the maximum is %d.",
1444 modcount, MAX_BOOT_MODULES);
1445 }
1446 /*
1447 * search the modules to find the last used address
1448 * we'll build the module list while we're walking through here
1449 */
1450 check_higher((paddr_t)(uintptr_t)&_end);
1451 for (i = 0; i < modcount; ++i) {
1452 process_module(i);
1453 modules_used++;
1454 }
1455 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1456 DBG(bi->bi_modules);
1457 bi->bi_module_cnt = modcount;
1458 DBG(bi->bi_module_cnt);
1459
1460 fixup_modules();
1461 assign_module_hashes();
1462 check_images();
1463 }
1464
1465 /*
1466 * We then build the phys_install memlist from the multiboot information.
1467 */
1468 static void
dboot_process_mmap(void)1469 dboot_process_mmap(void)
1470 {
1471 uint64_t start;
1472 uint64_t end;
1473 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
1474 uint32_t lower, upper;
1475 int i, mmap_entries;
1476
1477 /*
1478 * Walk through the memory map from multiboot and build our memlist
1479 * structures. Note these will have native format pointers.
1480 */
1481 DBG_MSG("\nFinding Memory Map\n");
1482 num_entries = 0;
1483 num_entries_set = B_FALSE;
1484 max_mem = 0;
1485 if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1486 for (i = 0; i < mmap_entries; i++) {
1487 uint32_t type = dboot_loader_mmap_get_type(i);
1488 start = dboot_loader_mmap_get_base(i);
1489 end = start + dboot_loader_mmap_get_length(i);
1490
1491 if (prom_debug)
1492 dboot_printf("\ttype: %d %" PRIx64 "..%"
1493 PRIx64 "\n", type, start, end);
1494
1495 /*
1496 * page align start and end
1497 */
1498 start = (start + page_offset) & ~page_offset;
1499 end &= ~page_offset;
1500 if (end <= start)
1501 continue;
1502
1503 /*
1504 * only type 1 is usable RAM
1505 */
1506 switch (type) {
1507 case 1:
1508 if (end > max_mem)
1509 max_mem = end;
1510 memlists[memlists_used].addr = start;
1511 memlists[memlists_used].size = end - start;
1512 ++memlists_used;
1513 if (memlists_used > MAX_MEMLIST)
1514 dboot_panic("too many memlists");
1515 break;
1516 case 2:
1517 rsvdmemlists[rsvdmemlists_used].addr = start;
1518 rsvdmemlists[rsvdmemlists_used].size =
1519 end - start;
1520 ++rsvdmemlists_used;
1521 if (rsvdmemlists_used > MAX_MEMLIST)
1522 dboot_panic("too many rsvdmemlists");
1523 break;
1524 default:
1525 continue;
1526 }
1527 }
1528 build_pcimemlists();
1529 } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1530 DBG(lower);
1531 memlists[memlists_used].addr = 0;
1532 memlists[memlists_used].size = lower * 1024;
1533 ++memlists_used;
1534 DBG(upper);
1535 memlists[memlists_used].addr = 1024 * 1024;
1536 memlists[memlists_used].size = upper * 1024;
1537 ++memlists_used;
1538
1539 /*
1540 * Old platform - assume I/O space at the end of memory.
1541 */
1542 pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1543 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1544 pcimemlists[0].next = 0;
1545 pcimemlists[0].prev = 0;
1546 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1547 DBG(bi->bi_pcimem);
1548 } else {
1549 dboot_panic("No memory info from boot loader!!!");
1550 }
1551
1552 /*
1553 * finish processing the physinstall list
1554 */
1555 sort_physinstall();
1556
1557 /*
1558 * build bios reserved mem lists
1559 */
1560 build_rsvdmemlists();
1561 }
1562
1563 /*
1564 * The highest address is used as the starting point for dboot's simple
1565 * memory allocator.
1566 *
1567 * Finding the highest address in case of Multiboot 1 protocol is
1568 * quite painful in the sense that some information provided by
1569 * the multiboot info structure points to BIOS data, and some to RAM.
1570 *
1571 * The module list was processed and checked already by dboot_process_modules(),
1572 * so we will check the command line string and the memory map.
1573 *
1574 * This list of to be checked items is based on our current knowledge of
1575 * allocations made by grub1 and will need to be reviewed if there
1576 * are updates about the information provided by Multiboot 1.
1577 *
1578 * In the case of the Multiboot 2, our life is much simpler, as the MB2
1579 * information tag list is one contiguous chunk of memory.
1580 */
1581 static paddr_t
dboot_multiboot1_highest_addr(void)1582 dboot_multiboot1_highest_addr(void)
1583 {
1584 paddr_t addr = (paddr_t)(uintptr_t)NULL;
1585 char *cmdl = (char *)mb_info->cmdline;
1586
1587 if (mb_info->flags & MB_INFO_CMDLINE)
1588 addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1589
1590 if (mb_info->flags & MB_INFO_MEM_MAP)
1591 addr = MAX(addr,
1592 ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1593 return (addr);
1594 }
1595
1596 static void
dboot_multiboot_highest_addr(void)1597 dboot_multiboot_highest_addr(void)
1598 {
1599 paddr_t addr;
1600
1601 switch (multiboot_version) {
1602 case 1:
1603 addr = dboot_multiboot1_highest_addr();
1604 if (addr != (paddr_t)(uintptr_t)NULL)
1605 check_higher(addr);
1606 break;
1607 case 2:
1608 addr = dboot_multiboot2_highest_addr(mb2_info);
1609 if (addr != (paddr_t)(uintptr_t)NULL)
1610 check_higher(addr);
1611 break;
1612 default:
1613 dboot_panic("Unknown multiboot version: %d\n",
1614 multiboot_version);
1615 break;
1616 }
1617 }
1618
/*
 * Walk the boot loader provided information and find the highest free address.
 *
 * This is done in three passes: the modules, then the memory map, and
 * finally the remaining boot loader information.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}
1630
1631 static int
dboot_same_guids(efi_guid_t * g1,efi_guid_t * g2)1632 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1633 {
1634 int i;
1635
1636 if (g1->time_low != g2->time_low)
1637 return (0);
1638 if (g1->time_mid != g2->time_mid)
1639 return (0);
1640 if (g1->time_hi_and_version != g2->time_hi_and_version)
1641 return (0);
1642 if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1643 return (0);
1644 if (g1->clock_seq_low != g2->clock_seq_low)
1645 return (0);
1646
1647 for (i = 0; i < 6; i++) {
1648 if (g1->node_addr[i] != g2->node_addr[i])
1649 return (0);
1650 }
1651 return (1);
1652 }
1653
1654 static void
process_efi32(EFI_SYSTEM_TABLE32 * efi)1655 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1656 {
1657 uint32_t entries;
1658 EFI_CONFIGURATION_TABLE32 *config;
1659 efi_guid_t VendorGuid;
1660 int i;
1661
1662 entries = efi->NumberOfTableEntries;
1663 config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1664 efi->ConfigurationTable;
1665
1666 for (i = 0; i < entries; i++) {
1667 (void) memcpy(&VendorGuid, &config[i].VendorGuid,
1668 sizeof (VendorGuid));
1669 if (dboot_same_guids(&VendorGuid, &smbios3)) {
1670 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1671 config[i].VendorTable;
1672 }
1673 if (bi->bi_smbios == 0 &&
1674 dboot_same_guids(&VendorGuid, &smbios)) {
1675 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1676 config[i].VendorTable;
1677 }
1678 if (dboot_same_guids(&VendorGuid, &acpi2)) {
1679 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1680 config[i].VendorTable;
1681 }
1682 if (bi->bi_acpi_rsdp == 0 &&
1683 dboot_same_guids(&VendorGuid, &acpi1)) {
1684 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1685 config[i].VendorTable;
1686 }
1687 }
1688 }
1689
1690 static void
process_efi64(EFI_SYSTEM_TABLE64 * efi)1691 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1692 {
1693 uint64_t entries;
1694 EFI_CONFIGURATION_TABLE64 *config;
1695 efi_guid_t VendorGuid;
1696 int i;
1697
1698 entries = efi->NumberOfTableEntries;
1699 config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1700 efi->ConfigurationTable;
1701
1702 for (i = 0; i < entries; i++) {
1703 (void) memcpy(&VendorGuid, &config[i].VendorGuid,
1704 sizeof (VendorGuid));
1705 if (dboot_same_guids(&VendorGuid, &smbios3)) {
1706 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1707 config[i].VendorTable;
1708 }
1709 if (bi->bi_smbios == 0 &&
1710 dboot_same_guids(&VendorGuid, &smbios)) {
1711 bi->bi_smbios = (native_ptr_t)(uintptr_t)
1712 config[i].VendorTable;
1713 }
1714 /* Prefer acpi v2+ over v1. */
1715 if (dboot_same_guids(&VendorGuid, &acpi2)) {
1716 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1717 config[i].VendorTable;
1718 }
1719 if (bi->bi_acpi_rsdp == 0 &&
1720 dboot_same_guids(&VendorGuid, &acpi1)) {
1721 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1722 config[i].VendorTable;
1723 }
1724 }
1725 }
1726
1727 static void
dboot_multiboot_get_fwtables(void)1728 dboot_multiboot_get_fwtables(void)
1729 {
1730 multiboot_tag_new_acpi_t *nacpitagp;
1731 multiboot_tag_old_acpi_t *oacpitagp;
1732 multiboot_tag_efi64_t *efi64tagp = NULL;
1733 multiboot_tag_efi32_t *efi32tagp = NULL;
1734
1735 /* no fw tables from multiboot 1 */
1736 if (multiboot_version != 2)
1737 return;
1738
1739 efi64tagp = (multiboot_tag_efi64_t *)
1740 dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1741 if (efi64tagp != NULL) {
1742 bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1743 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1744 efi64tagp->mb_pointer;
1745 process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1746 efi64tagp->mb_pointer);
1747 } else {
1748 efi32tagp = (multiboot_tag_efi32_t *)
1749 dboot_multiboot2_find_tag(mb2_info,
1750 MULTIBOOT_TAG_TYPE_EFI32);
1751 if (efi32tagp != NULL) {
1752 bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1753 bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1754 efi32tagp->mb_pointer;
1755 process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1756 efi32tagp->mb_pointer);
1757 }
1758 }
1759
1760 /*
1761 * The multiboot2 info contains a copy of the RSDP; stash a pointer to
1762 * it (see find_rsdp() in fakebop).
1763 */
1764 nacpitagp = (multiboot_tag_new_acpi_t *)
1765 dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_NEW);
1766 oacpitagp = (multiboot_tag_old_acpi_t *)
1767 dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_OLD);
1768
1769 if (nacpitagp != NULL) {
1770 bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1771 &nacpitagp->mb_rsdp[0];
1772 } else if (oacpitagp != NULL) {
1773 bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1774 &oacpitagp->mb_rsdp[0];
1775 }
1776 }
1777
/*
 * Print an EFI version as "major.minor" followed by a newline. The minor
 * number is printed as its tens digit, followed by ".<units digit>" only
 * when the units digit is non-zero.
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor, tens, units;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	minor = EFI_REV_MINOR(ver);
	tens = minor / 10;
	units = minor % 10;

	if (units == 0)
		dboot_printf("%d\n", tens);
	else
		dboot_printf("%d.%d\n", tens, units);
}
1793
1794 static void
print_efi32(EFI_SYSTEM_TABLE32 * efi)1795 print_efi32(EFI_SYSTEM_TABLE32 *efi)
1796 {
1797 uint16_t *data;
1798 EFI_CONFIGURATION_TABLE32 *conf;
1799 int i;
1800
1801 dboot_printf("EFI32 signature: %llx\n",
1802 (unsigned long long)efi->Hdr.Signature);
1803 dboot_printf("EFI system version: ");
1804 dboot_print_efi_version(efi->Hdr.Revision);
1805 dboot_printf("EFI system vendor: ");
1806 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1807 for (i = 0; data[i] != 0; i++)
1808 dboot_printf("%c", (char)data[i]);
1809 dboot_printf("\nEFI firmware revision: ");
1810 dboot_print_efi_version(efi->FirmwareRevision);
1811 dboot_printf("EFI system table number of entries: %d\n",
1812 efi->NumberOfTableEntries);
1813 conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1814 efi->ConfigurationTable;
1815 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1816 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1817 conf[i].VendorGuid.time_low,
1818 conf[i].VendorGuid.time_mid,
1819 conf[i].VendorGuid.time_hi_and_version,
1820 conf[i].VendorGuid.clock_seq_hi_and_reserved,
1821 conf[i].VendorGuid.clock_seq_low);
1822 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1823 conf[i].VendorGuid.node_addr[0],
1824 conf[i].VendorGuid.node_addr[1],
1825 conf[i].VendorGuid.node_addr[2],
1826 conf[i].VendorGuid.node_addr[3],
1827 conf[i].VendorGuid.node_addr[4],
1828 conf[i].VendorGuid.node_addr[5]);
1829 }
1830 }
1831
1832 static void
print_efi64(EFI_SYSTEM_TABLE64 * efi)1833 print_efi64(EFI_SYSTEM_TABLE64 *efi)
1834 {
1835 uint16_t *data;
1836 EFI_CONFIGURATION_TABLE64 *conf;
1837 int i;
1838
1839 dboot_printf("EFI64 signature: %llx\n",
1840 (unsigned long long)efi->Hdr.Signature);
1841 dboot_printf("EFI system version: ");
1842 dboot_print_efi_version(efi->Hdr.Revision);
1843 dboot_printf("EFI system vendor: ");
1844 data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1845 for (i = 0; data[i] != 0; i++)
1846 dboot_printf("%c", (char)data[i]);
1847 dboot_printf("\nEFI firmware revision: ");
1848 dboot_print_efi_version(efi->FirmwareRevision);
1849 dboot_printf("EFI system table number of entries: %" PRIu64 "\n",
1850 efi->NumberOfTableEntries);
1851 conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1852 efi->ConfigurationTable;
1853 for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1854 dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1855 conf[i].VendorGuid.time_low,
1856 conf[i].VendorGuid.time_mid,
1857 conf[i].VendorGuid.time_hi_and_version,
1858 conf[i].VendorGuid.clock_seq_hi_and_reserved,
1859 conf[i].VendorGuid.clock_seq_low);
1860 dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1861 conf[i].VendorGuid.node_addr[0],
1862 conf[i].VendorGuid.node_addr[1],
1863 conf[i].VendorGuid.node_addr[2],
1864 conf[i].VendorGuid.node_addr[3],
1865 conf[i].VendorGuid.node_addr[4],
1866 conf[i].VendorGuid.node_addr[5]);
1867 }
1868 }
1869 #endif /* !__xpv */
1870
1871 /*
1872 * Simple memory allocator, allocates aligned physical memory.
1873 * Note that startup_kernel() only allocates memory, never frees.
1874 * Memory usage just grows in an upward direction.
1875 */
1876 static void *
do_mem_alloc(uint32_t size,uint32_t align)1877 do_mem_alloc(uint32_t size, uint32_t align)
1878 {
1879 uint_t i;
1880 uint64_t best;
1881 uint64_t start;
1882 uint64_t end;
1883
1884 /*
1885 * make sure size is a multiple of pagesize
1886 */
1887 size = RNDUP(size, MMU_PAGESIZE);
1888 next_avail_addr = RNDUP(next_avail_addr, align);
1889
1890 /*
1891 * XXPV fixme joe
1892 *
1893 * a really large bootarchive that causes you to run out of memory
1894 * may cause this to blow up
1895 */
1896 /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1897 best = (uint64_t)-size;
1898 for (i = 0; i < memlists_used; ++i) {
1899 start = memlists[i].addr;
1900 #if defined(__xpv)
1901 start += mfn_base;
1902 #endif
1903 end = start + memlists[i].size;
1904
1905 /*
1906 * did we find the desired address?
1907 */
1908 if (start <= next_avail_addr && next_avail_addr + size <= end) {
1909 best = next_avail_addr;
1910 goto done;
1911 }
1912
1913 /*
1914 * if not is this address the best so far?
1915 */
1916 if (start > next_avail_addr && start < best &&
1917 RNDUP(start, align) + size <= end)
1918 best = RNDUP(start, align);
1919 }
1920
1921 /*
1922 * We didn't find exactly the address we wanted, due to going off the
1923 * end of a memory region. Return the best found memory address.
1924 */
1925 done:
1926 next_avail_addr = best + size;
1927 #if defined(__xpv)
1928 if (next_avail_addr > scratch_end)
1929 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1930 "0x%lx", (ulong_t)next_avail_addr,
1931 (ulong_t)scratch_end);
1932 #endif
1933 (void) memset((void *)(uintptr_t)best, 0, size);
1934 return ((void *)(uintptr_t)best);
1935 }
1936
1937 void *
mem_alloc(uint32_t size)1938 mem_alloc(uint32_t size)
1939 {
1940 return (do_mem_alloc(size, MMU_PAGESIZE));
1941 }
1942
1943
1944 /*
1945 * Build page tables to map all of memory used so far as well as the kernel.
1946 */
1947 static void
build_page_tables(void)1948 build_page_tables(void)
1949 {
1950 uint32_t psize;
1951 uint32_t level;
1952 uint32_t off;
1953 uint64_t start;
1954 #if !defined(__xpv)
1955 uint32_t i;
1956 uint64_t end;
1957 #endif /* __xpv */
1958
1959 /*
1960 * If we're on metal, we need to create the top level pagetable.
1961 */
1962 #if defined(__xpv)
1963 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1964 #else /* __xpv */
1965 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1966 #endif /* __xpv */
1967 DBG((uintptr_t)top_page_table);
1968
1969 /*
1970 * Determine if we'll use large mappings for kernel, then map it.
1971 */
1972 if (largepage_support) {
1973 psize = lpagesize;
1974 level = 1;
1975 } else {
1976 psize = MMU_PAGESIZE;
1977 level = 0;
1978 }
1979
1980 DBG_MSG("Mapping kernel\n");
1981 DBG(ktext_phys);
1982 DBG(target_kernel_text);
1983 DBG(ksize);
1984 DBG(psize);
1985 for (off = 0; off < ksize; off += psize)
1986 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1987
1988 /*
1989 * The kernel will need a 1 page window to work with page tables
1990 */
1991 bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1992 DBG(bi->bi_pt_window);
1993 bi->bi_pte_to_pt_window =
1994 (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1995 DBG(bi->bi_pte_to_pt_window);
1996
1997 #if defined(__xpv)
1998 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1999 /* If this is a domU we're done. */
2000 DBG_MSG("\nPage tables constructed\n");
2001 return;
2002 }
2003 #endif /* __xpv */
2004
2005 /*
2006 * We need 1:1 mappings for the lower 1M of memory to access
2007 * BIOS tables used by a couple of drivers during boot.
2008 *
2009 * The following code works because our simple memory allocator
2010 * only grows usage in an upwards direction.
2011 *
2012 * Note that by this point in boot some mappings for low memory
2013 * may already exist because we've already accessed device in low
2014 * memory. (Specifically the video frame buffer and keyboard
2015 * status ports.) If we're booting on raw hardware then GRUB
2016 * created these mappings for us. If we're booting under a
2017 * hypervisor then we went ahead and remapped these devices into
2018 * memory allocated within dboot itself.
2019 */
2020 if (map_debug)
2021 dboot_printf("1:1 map pa=0..1Meg\n");
2022 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
2023 #if defined(__xpv)
2024 map_ma_at_va(start, start, 0);
2025 #else /* __xpv */
2026 map_pa_at_va(start, start, 0);
2027 #endif /* __xpv */
2028 }
2029
2030 #if !defined(__xpv)
2031
2032 for (i = 0; i < memlists_used; ++i) {
2033 start = memlists[i].addr;
2034 end = start + memlists[i].size;
2035
2036 if (map_debug)
2037 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2038 start, end);
2039 while (start < end && start < next_avail_addr) {
2040 map_pa_at_va(start, start, 0);
2041 start += MMU_PAGESIZE;
2042 }
2043 if (start >= next_avail_addr)
2044 break;
2045 }
2046
2047 /*
2048 * Map framebuffer memory as PT_NOCACHE as this is memory from a
2049 * device and therefore must not be cached.
2050 */
2051 if (fb != NULL && fb->framebuffer != 0) {
2052 multiboot_tag_framebuffer_t *fb_tagp;
2053 fb_tagp = (multiboot_tag_framebuffer_t *)(uintptr_t)
2054 fb->framebuffer;
2055
2056 start = fb_tagp->framebuffer_common.framebuffer_addr;
2057 end = start + fb_tagp->framebuffer_common.framebuffer_height *
2058 fb_tagp->framebuffer_common.framebuffer_pitch;
2059
2060 if (map_debug)
2061 dboot_printf("FB 1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2062 start, end);
2063 pte_bits |= PT_NOCACHE;
2064 if (PAT_support != 0)
2065 pte_bits |= PT_PAT_4K;
2066
2067 while (start < end) {
2068 map_pa_at_va(start, start, 0);
2069 start += MMU_PAGESIZE;
2070 }
2071 pte_bits &= ~PT_NOCACHE;
2072 if (PAT_support != 0)
2073 pte_bits &= ~PT_PAT_4K;
2074 }
2075 #endif /* !__xpv */
2076
2077 DBG_MSG("\nPage tables constructed\n");
2078 }
2079
/*
 * Message telling the operator that the GRUB entry must use the kernel$ and
 * module$ lines shown rather than multiboot.
 */
#define	NO_MULTIBOOT \
	"multiboot is no longer used to boot the Solaris Operating System.\n" \
	"The grub entry should be changed to:\n" \
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
	"module$ /platform/i86pc/$ISADIR/boot_archive\n" \
	"See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2086
2087 static void
dboot_init_xboot_consinfo(void)2088 dboot_init_xboot_consinfo(void)
2089 {
2090 bi = &boot_info;
2091
2092 #if !defined(__xpv)
2093 fb = &framebuffer;
2094 bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
2095
2096 switch (multiboot_version) {
2097 case 1:
2098 dboot_multiboot1_xboot_consinfo();
2099 break;
2100 case 2:
2101 dboot_multiboot2_xboot_consinfo();
2102 break;
2103 default:
2104 dboot_panic("Unknown multiboot version: %d\n",
2105 multiboot_version);
2106 break;
2107 }
2108 dboot_find_console_modules();
2109 #endif
2110 }
2111
2112 /*
2113 * Set up basic data from the boot loader.
2114 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2115 * 32-bit dboot code setup used to set up and start 64-bit kernel.
2116 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2117 * start 64-bit illumos kernel.
2118 */
2119 static void
dboot_loader_init(void)2120 dboot_loader_init(void)
2121 {
2122 #if !defined(__xpv)
2123 mb_info = NULL;
2124 mb2_info = NULL;
2125
2126 switch (mb_magic) {
2127 case MB_BOOTLOADER_MAGIC:
2128 multiboot_version = 1;
2129 mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2130 #if defined(_BOOT_TARGET_amd64)
2131 load_addr = mb_header.load_addr;
2132 #endif
2133 break;
2134
2135 case MULTIBOOT2_BOOTLOADER_MAGIC:
2136 multiboot_version = 2;
2137 mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2138 mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2139 #if defined(_BOOT_TARGET_amd64)
2140 load_addr = mb2_load_addr;
2141 #endif
2142 break;
2143
2144 default:
2145 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2146 break;
2147 }
2148 #endif /* !defined(__xpv) */
2149 }
2150
2151 /* Extract the kernel command line from [multi]boot information. */
2152 static char *
dboot_loader_cmdline(void)2153 dboot_loader_cmdline(void)
2154 {
2155 char *line = NULL;
2156
2157 #if defined(__xpv)
2158 line = (char *)xen_info->cmd_line;
2159 #else /* __xpv */
2160
2161 switch (multiboot_version) {
2162 case 1:
2163 if (mb_info->flags & MB_INFO_CMDLINE)
2164 line = (char *)mb_info->cmdline;
2165 break;
2166
2167 case 2:
2168 line = dboot_multiboot2_cmdline(mb2_info);
2169 break;
2170
2171 default:
2172 dboot_panic("Unknown multiboot version: %d\n",
2173 multiboot_version);
2174 break;
2175 }
2176
2177 #endif /* __xpv */
2178
2179 /*
2180 * Make sure we have valid pointer so the string operations
2181 * will not crash us.
2182 */
2183 if (line == NULL)
2184 line = "";
2185
2186 return (line);
2187 }
2188
2189 static char *
dboot_loader_name(void)2190 dboot_loader_name(void)
2191 {
2192 #if defined(__xpv)
2193 return (NULL);
2194 #else /* __xpv */
2195 multiboot_tag_string_t *tag;
2196
2197 switch (multiboot_version) {
2198 case 1:
2199 return ((char *)(uintptr_t)mb_info->boot_loader_name);
2200
2201 case 2:
2202 tag = dboot_multiboot2_find_tag(mb2_info,
2203 MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2204 return (tag->mb_string);
2205 default:
2206 dboot_panic("Unknown multiboot version: %d\n",
2207 multiboot_version);
2208 break;
2209 }
2210
2211 return (NULL);
2212 #endif /* __xpv */
2213 }
2214
/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at virtual address of target_kernel_text using large page
 * mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 *
 * In order, the function:
 *   - identifies the boot loader and fetches its name and command line,
 *   - sets up the console and the prom_debug / map_debug flags,
 *   - determines the MMU features (PAE, NX, PAT, PGE, large pages, long
 *     mode), either from the hypervisor (__xpv) or from cpuid,
 *   - initializes the memory allocator and the paging parameters,
 *   - on non-xpv amd64, allocates the nucleus and loads the ELF64 image,
 *   - builds the page tables, and
 *   - fills in the remaining boot info (bi) fields before returning to the
 *     assembly code that jumps to the kernel.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	char *bootloader;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	if (dboot_debug == 1)
		bcons_init(NULL);	/* Set very early console to ttya. */
	dboot_loader_init();
	/*
	 * At this point we are executing in a 32 bit real mode.
	 *
	 * NOTE(review): "real mode" is presumably loose wording; the
	 * Multiboot specifications describe a 32-bit protected mode
	 * hand-off -- confirm.
	 */

	/* bootloader may be NULL; cmdline is never NULL (may be empty). */
	bootloader = dboot_loader_name();
	cmdline = dboot_loader_cmdline();

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable io operations, so set I/O privilege level to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	/*
	 * Record the command line in the boot info before bcons_init(bi)
	 * is called with it.
	 */
	dboot_init_xboot_consinfo();
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
	bcons_init(bi);		/* Now we can set the real console. */

	/* Each debug flag is set when its boot property is found. */
	prom_debug = (find_boot_prop("prom_debug") != NULL);
	map_debug = (find_boot_prop("map_debug") != NULL);

#if !defined(__xpv)
	dboot_multiboot_get_fwtables();
#endif
	/*
	 * NOTE(review): cmdline is passed as the sole DBG_MSG argument; if
	 * the macro forwards its argument as a printf-style format, a '%'
	 * in the command line would be interpreted -- confirm against the
	 * DBG_MSG definition.
	 */
	DBG_MSG("\n\nillumos prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (bootloader != NULL && prom_debug) {
		dboot_printf("Kernel loaded by: %s\n", bootloader);
#if !defined(__xpv)
		dboot_printf("Using multiboot %d boot protocol.\n",
		    multiboot_version);
#endif
	}

	/*
	 * Panic with NO_MULTIBOOT if the string "multiboot" appears anywhere
	 * in the command line.
	 */
	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	DBG((uintptr_t)bi);
#if !defined(__xpv)
	DBG((uintptr_t)mb_info);
	DBG((uintptr_t)mb2_info);
	if (mb2_info != NULL)
		DBG(mb2_info->mbi_total_size);
	DBG(bi->bi_acpi_rsdp);
	DBG(bi->bi_acpi_rsdp_copy);
	DBG(bi->bi_smbios);
	DBG(bi->bi_uefi_arch);
	DBG(bi->bi_uefi_systab);

	/*
	 * With prom_debug set, print the UEFI system table using the
	 * printer that matches the recorded UEFI architecture.
	 */
	if (bi->bi_uefi_systab && prom_debug) {
		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
			    bi->bi_uefi_systab);
		} else {
			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
			    bi->bi_uefi_systab);
		}
	}
#endif

	/*
	 * Need correct target_kernel_text value
	 */
	target_kernel_text = KERNEL_TEXT;
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor.
	 */
	amd64_support = 1;
	pae_support = 1;

#else	/* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE Hypervisor
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		/* Force termination before treating caps as a string. */
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif	/* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
		/* Both globals are set from the reported virt_start. */
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads stuff starting at 1Gig
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * enable writable page table mode for the hypervisor
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * check for NX support
	 */
	if (pae_support) {
		/*
		 * eax is passed by address; its value after the first call
		 * is compared against 0x80000001 before that value is
		 * queried.
		 */
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

	/*
	 * check for PAT support
	 */
	{
		uint32_t eax = 1;
		uint32_t edx = get_cpuid_edx(&eax);

		if (edx & CPUID_INTC_EDX_PAT)
			PAT_support = 1;
	}
#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests. This means when a guest attempts to install a flat 4GB
	 * code or data descriptor the 32-bit hypervisor will protect itself
	 * by silently shrinking the segment such that if the guest attempts
	 * any access where the hypervisor lives a #gp fault is generated.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data. TLS support in linux
	 * brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif	/* !_BOOT_TARGET_amd64 */

#else	/* __xpv */

	/*
	 * use cpuid to enable MMU features
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		/* Query with eax = 1 and test the EDX feature bits. */
		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;
		if (edx & CPUID_INTC_EDX_PAT)
			PAT_support = 1;

		/*
		 * eax is passed by address; its value after this call is
		 * compared against 0x80000001 before that value is queried
		 * for the LM and NX bits.
		 */
		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */


#if defined(_BOOT_TARGET_amd64)
	/* A 64-bit target requires both long mode and PAE. */
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to over-ride use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * initialize the simple memory allocator
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * configure mmu information
	 */
	if (pae_support) {
		/* PAE: 8-byte entries, 512 per table, 2M large pages. */
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		/*
		 * Non-PAE: 4-byte entries, 1024 per table, 4M large pages.
		 * pae_support is already zero on this path, so its
		 * assignment is redundant; NX_support is cleared as well.
		 */
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}

	DBG(PAT_support);
	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

	/*
	 * Default physical address of the kernel text; on non-xpv amd64
	 * this is replaced just below by the allocated nucleus address.
	 */
#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);

	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	DBG(load_addr);
	if (dboot_elfload64(load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	/* Record the final MMU feature choices in the boot info. */
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	/* Under xpv the recorded physical addresses are less mfn_base. */
	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * unmap unused pages in start area to make them available for DMA
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_version = multiboot_version;

	/* Hand on whichever multiboot info pointer matches the version. */
	switch (multiboot_version) {
	case 1:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
		break;
	case 2:
		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");

#ifndef __xpv
	/* Update boot info with FB data */
	/* Only the cursor state is copied from fb_info into fb here. */
	fb->cursor.origin.x = fb_info.cursor.origin.x;
	fb->cursor.origin.y = fb_info.cursor.origin.y;
	fb->cursor.pos.x = fb_info.cursor.pos.x;
	fb->cursor.pos.y = fb_info.cursor.pos.y;
	fb->cursor.visible = fb_info.cursor.visible;
#endif
}
2609