xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 3fb2fe9fdd2e33737038a161631f2ab6d7050ecf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2020 Joyent, Inc.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/framebuffer.h>
40 #include <sys/sha1.h>
41 #include <util/string.h>
42 #include <util/strtolctype.h>
43 #include <sys/efi.h>
44 
45 /*
46  * Compile time debug knob. We do not have any early mechanism to control it
47  * as the boot is the earliest mechanism we have, and we do not want to have
48  * it being switched on by default.
49  */
50 int dboot_debug = 0;
51 
52 #if defined(__xpv)
53 
54 #include <sys/hypervisor.h>
55 uintptr_t xen_virt_start;
56 pfn_t *mfn_to_pfn_mapping;
57 
58 #else /* !__xpv */
59 
60 extern multiboot_header_t mb_header;
61 extern uint32_t mb2_load_addr;
62 extern int have_cpuid(void);
63 
64 #endif /* !__xpv */
65 
66 #include <sys/inttypes.h>
67 #include <sys/bootinfo.h>
68 #include <sys/mach_mmu.h>
69 #include <sys/boot_console.h>
70 
71 #include "dboot_asm.h"
72 #include "dboot_printf.h"
73 #include "dboot_xboot.h"
74 #include "dboot_elfload.h"
75 
76 #define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
77 
78 /*
79  * This file contains code that runs to transition us from either a multiboot
80  * compliant loader (32 bit non-paging) or a XPV domain loader to
81  * regular kernel execution. Its task is to setup the kernel memory image
82  * and page tables.
83  *
84  * The code executes as:
85  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
86  *	- a 32 bit program for the 32-bit PV hypervisor
87  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
88  *
89  * Under the PV hypervisor, we must create mappings for any memory beyond the
90  * initial start of day allocation (such as the kernel itself).
91  *
92  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
94  */
95 
96 /*
97  * Standard bits used in PTE (page level) and PTP (internal levels)
98  */
99 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
100 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
101 
102 /*
 * These are the target addresses (physical) where the kernel text and data
104  * nucleus pages will be unpacked. On the hypervisor this is actually a
105  * virtual address.
106  */
107 paddr_t ktext_phys;
108 uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */
109 
110 static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */
111 
112 /*
113  * The stack is setup in assembler before entering startup_kernel()
114  */
115 char stack_space[STACK_SIZE];
116 
117 /*
118  * Used to track physical memory allocation
119  */
120 static paddr_t next_avail_addr = 0;
121 
122 #if defined(__xpv)
123 /*
124  * Additional information needed for hypervisor memory allocation.
125  * Only memory up to scratch_end is mapped by page tables.
126  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
127  * to derive a pfn from a pointer, you subtract mfn_base.
128  */
129 
130 static paddr_t scratch_end = 0;	/* we can't write all of mem here */
131 static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
132 start_info_t *xen_info;
133 
134 #else	/* __xpv */
135 
136 /*
137  * If on the metal, then we have a multiboot loader.
138  */
139 uint32_t mb_magic;			/* magic from boot loader */
140 uint32_t mb_addr;			/* multiboot info package from loader */
141 int multiboot_version;
142 multiboot_info_t *mb_info;
143 multiboot2_info_header_t *mb2_info;
144 int num_entries;			/* mmap entry count */
145 boolean_t num_entries_set;		/* is mmap entry count set */
146 uintptr_t load_addr;
147 static boot_framebuffer_t framebuffer __aligned(16);
148 static boot_framebuffer_t *fb;
149 
150 /* can not be automatic variables because of alignment */
151 static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
152 static efi_guid_t smbios = SMBIOS_TABLE_GUID;
153 static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
154 static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
155 #endif	/* __xpv */
156 
157 /*
158  * This contains information passed to the kernel
159  */
160 struct xboot_info boot_info __aligned(16);
161 struct xboot_info *bi;
162 
163 /*
164  * Page table and memory stuff.
165  */
166 static paddr_t max_mem;			/* maximum memory address */
167 
168 /*
169  * Information about processor MMU
170  */
171 int amd64_support = 0;
172 int largepage_support = 0;
173 int pae_support = 0;
174 int pge_support = 0;
175 int NX_support = 0;
176 int PAT_support = 0;
177 
178 /*
179  * Low 32 bits of kernel entry address passed back to assembler.
180  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
181  */
182 uint32_t entry_addr_low;
183 
184 /*
185  * Memlists for the kernel. We shouldn't need a lot of these.
186  */
187 #define	MAX_MEMLIST (50)
188 struct boot_memlist memlists[MAX_MEMLIST];
189 uint_t memlists_used = 0;
190 struct boot_memlist pcimemlists[MAX_MEMLIST];
191 uint_t pcimemlists_used = 0;
192 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
193 uint_t rsvdmemlists_used = 0;
194 
195 /*
196  * This should match what's in the bootloader.  It's arbitrary, but GRUB
197  * in particular has limitations on how much space it can use before it
198  * stops working properly.  This should be enough.
199  */
200 struct boot_modules modules[MAX_BOOT_MODULES];
201 uint_t modules_used = 0;
202 
203 #ifdef __xpv
204 /*
205  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
206  * definition in Xen source.
207  */
208 typedef struct {
209 	uint32_t	base_addr_low;
210 	uint32_t	base_addr_high;
211 	uint32_t	length_low;
212 	uint32_t	length_high;
213 	uint32_t	type;
214 } mmap_t;
215 
216 /*
217  * There is 512KB of scratch area after the boot stack page.
218  * We'll use that for everything except the kernel nucleus pages which are too
219  * big to fit there and are allocated last anyway.
220  */
221 #define	MAXMAPS	100
222 static mmap_t map_buffer[MAXMAPS];
223 #else
224 typedef mb_memory_map_t mmap_t;
225 #endif
226 
227 /*
228  * Debugging macros
229  */
230 uint_t prom_debug = 0;
231 uint_t map_debug = 0;
232 
233 static char noname[2] = "-";
234 
235 /*
236  * Either hypervisor-specific or grub-specific code builds the initial
237  * memlists. This code does the sort/merge/link for final use.
238  */
239 static void
sort_physinstall(void)240 sort_physinstall(void)
241 {
242 	int i;
243 #if !defined(__xpv)
244 	int j;
245 	struct boot_memlist tmp;
246 
247 	/*
248 	 * Now sort the memlists, in case they weren't in order.
249 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
250 	 */
251 	DBG_MSG("Sorting phys-installed list\n");
252 	for (j = memlists_used - 1; j > 0; --j) {
253 		for (i = 0; i < j; ++i) {
254 			if (memlists[i].addr < memlists[i + 1].addr)
255 				continue;
256 			tmp = memlists[i];
257 			memlists[i] = memlists[i + 1];
258 			memlists[i + 1] = tmp;
259 		}
260 	}
261 
262 	/*
263 	 * Merge any memlists that don't have holes between them.
264 	 */
265 	for (i = 0; i <= memlists_used - 1; ++i) {
266 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
267 			continue;
268 
269 		if (prom_debug)
270 			dboot_printf(
271 			    "merging mem segs %" PRIx64 "...%" PRIx64
272 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
273 			    memlists[i].addr,
274 			    memlists[i].addr + memlists[i].size,
275 			    memlists[i + 1].addr,
276 			    memlists[i + 1].addr + memlists[i + 1].size);
277 
278 		memlists[i].size += memlists[i + 1].size;
279 		for (j = i + 1; j < memlists_used - 1; ++j)
280 			memlists[j] = memlists[j + 1];
281 		--memlists_used;
282 		DBG(memlists_used);
283 		--i;	/* after merging we need to reexamine, so do this */
284 	}
285 #endif	/* __xpv */
286 
287 	if (prom_debug) {
288 		dboot_printf("\nFinal memlists:\n");
289 		for (i = 0; i < memlists_used; ++i) {
290 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
291 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
292 		}
293 	}
294 
295 	/*
296 	 * link together the memlists with native size pointers
297 	 */
298 	memlists[0].next = 0;
299 	memlists[0].prev = 0;
300 	for (i = 1; i < memlists_used; ++i) {
301 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
302 		memlists[i].next = 0;
303 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
304 	}
305 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
306 	DBG(bi->bi_phys_install);
307 }
308 
309 /*
310  * build bios reserved memlists
311  */
312 static void
build_rsvdmemlists(void)313 build_rsvdmemlists(void)
314 {
315 	int i;
316 
317 	rsvdmemlists[0].next = 0;
318 	rsvdmemlists[0].prev = 0;
319 	for (i = 1; i < rsvdmemlists_used; ++i) {
320 		rsvdmemlists[i].prev =
321 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
322 		rsvdmemlists[i].next = 0;
323 		rsvdmemlists[i - 1].next =
324 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
325 	}
326 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
327 	DBG(bi->bi_rsvdmem);
328 }
329 
330 #if defined(__xpv)
331 
332 /*
333  * halt on the hypervisor after a delay to drain console output
334  */
335 __NORETURN void
dboot_halt(void)336 dboot_halt(void)
337 {
338 	uint_t i = 10000;
339 
340 	while (--i)
341 		(void) HYPERVISOR_yield();
342 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
343 	/* never reached */
344 	for (;;)
345 		;
346 }
347 
348 /*
349  * From a machine address, find the corresponding pseudo-physical address.
350  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
351  * Machine addresses are the real underlying hardware addresses.
352  * These are needed for page table entries. Note that this routine is
353  * poorly protected. A bad value of "ma" will cause a page fault.
354  */
355 paddr_t
ma_to_pa(maddr_t ma)356 ma_to_pa(maddr_t ma)
357 {
358 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
359 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
360 	paddr_t pa;
361 
362 	if (pfn >= xen_info->nr_pages)
363 		return (-(paddr_t)1);
364 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
365 #ifdef DEBUG
366 	if (ma != pa_to_ma(pa))
367 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
368 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
369 #endif
370 	return (pa);
371 }
372 
373 /*
374  * From a pseudo-physical address, find the corresponding machine address.
375  */
376 maddr_t
pa_to_ma(paddr_t pa)377 pa_to_ma(paddr_t pa)
378 {
379 	pfn_t pfn;
380 	ulong_t mfn;
381 
382 	pfn = mmu_btop(pa - mfn_base);
383 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
384 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
385 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
386 #ifdef DEBUG
387 	if (mfn_to_pfn_mapping[mfn] != pfn)
388 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
389 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
390 #endif
391 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
392 }
393 
394 #endif	/* __xpv */
395 
396 x86pte_t
get_pteval(paddr_t table,uint_t index)397 get_pteval(paddr_t table, uint_t index)
398 {
399 	if (pae_support)
400 		return (((x86pte_t *)(uintptr_t)table)[index]);
401 	return (((x86pte32_t *)(uintptr_t)table)[index]);
402 }
403 
/*ARGSUSED*/
/*
 * Store "pteval" into entry "index" of the page table at "table".
 * Under the hypervisor this must go through a mmu_update hypercall,
 * since the tables are live and not directly writable by us; on bare
 * metal we write the entry directly, reloading %cr3 when we modify
 * the top-level table of a 2-level (non-PAE 32-bit) MMU.
 */
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	/* the low bits of the machine address encode the update type */
	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	/* a top-level change on a 2-level MMU requires flushing the TLB */
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}
428 
429 paddr_t
make_ptable(x86pte_t * pteval,uint_t level)430 make_ptable(x86pte_t *pteval, uint_t level)
431 {
432 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
433 
434 	if (level == top_level && level == 2)
435 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
436 	else
437 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
438 
439 #ifdef __xpv
440 	/* Remove write permission to the new page table. */
441 	if (HYPERVISOR_update_va_mapping(new_table,
442 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
443 		dboot_panic("HYP_update_va_mapping error");
444 #endif
445 
446 	if (map_debug)
447 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
448 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
449 	return (new_table);
450 }
451 
452 x86pte_t *
map_pte(paddr_t table,uint_t index)453 map_pte(paddr_t table, uint_t index)
454 {
455 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
456 }
457 
/*
 * dump out the contents of page tables...
 *
 * Walks the finished page tables and prints every valid entry.  The
 * descent into lower-level tables is done manually (via the
 * save_table[]/save_index[] stacks and the recursion/next_entry labels)
 * rather than with real recursion.  Debug output only.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;		/* current page table level */
	uint64_t va;		/* VA mapped by the current entry */
	uint64_t pgsize;	/* bytes covered by one entry at level l */
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;	/* indent by table depth */
	uint_t pa, pa1;
#if !defined(__xpv)
	/* on bare metal machine and physical addresses are identical */
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/* descend: push current position, restart at 0 */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;	/* ++index in for() makes this 0 */
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 255)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		/* pop back up to the saved position in the parent table */
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
544 
/*
 * Add a mapping for the machine page at the given virtual address.
 * "level" selects the page size: level 0 installs a normal page,
 * level > 0 installs a large (PT_PAGESIZE) mapping at that level.
 */
static void
map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
{
	x86pte_t *ptep;
	x86pte_t pteval;

	pteval = ma | pte_bits;
	if (level > 0)
		pteval |= PT_PAGESIZE;
	/* mark kernel mappings global so they survive %cr3 reloads */
	if (va >= target_kernel_text && pge_support)
		pteval |= PT_GLOBAL;

	if (map_debug && ma != va)
		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
		    " pte=0x%" PRIx64 " l=%d\n",
		    (uint64_t)ma, (uint64_t)va, pteval, level);

#if defined(__xpv)
	/*
	 * see if we can avoid find_pte() on the hypervisor
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables
	 */
	ptep = find_pte(va, NULL, level, 0);

	/*
	 * When paravirtualized, we must use hypervisor calls to modify the
	 * PTE, since paging is active. On real hardware we just write to
	 * the pagetables which aren't in use yet.
	 */
#if defined(__xpv)
	ptep = ptep;	/* shut lint up */
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
		    (uint64_t)va, level, (uint64_t)ma, pteval);
#else
	if (va < 1024 * 1024)
		pteval |= PT_NOCACHE;		/* for video RAM */
	if (pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
#endif
}
600 
601 /*
602  * Add a mapping for the physical page at the given virtual address.
603  */
604 static void
map_pa_at_va(paddr_t pa,native_ptr_t va,uint_t level)605 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
606 {
607 	map_ma_at_va(pa_to_ma(pa), va, level);
608 }
609 
/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	/*
	 * For each current PCI range, handle the four ways [start, end)
	 * can overlap it: full containment (drop the range), interior
	 * overlap (split it in two), or partial overlap at either edge
	 * (shrink it).
	 */
	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			/* shift later entries up to open a slot after ml */
			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			/* the duplicated entry becomes the tail piece */
			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}
663 
664 /*
665  * During memory allocation, find the highest address not used yet.
666  */
667 static void
check_higher(paddr_t a)668 check_higher(paddr_t a)
669 {
670 	if (a < next_avail_addr)
671 		return;
672 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
673 	DBG(next_avail_addr);
674 }
675 
/*
 * Return the number of entries in the loader-provided memory map,
 * caching the result in num_entries/num_entries_set.  Multiboot 1
 * supplies a list of variable-sized records that must be walked to
 * count; for multiboot 2 the EFI memory map is preferred with the
 * BIOS-style map as fallback.  Under the hypervisor the map buffer
 * has a fixed capacity.
 */
static int
dboot_loader_mmap_entries(void)
{
#if !defined(__xpv)
	if (num_entries_set == B_TRUE)
		return (num_entries);

	switch (multiboot_version) {
	case 1:
		DBG(mb_info->flags);
		/* flag 0x40 means mmap_addr/mmap_length are valid */
		if (mb_info->flags & 0x40) {
			mb_memory_map_t *mmap;
			caddr32_t mmap_addr;

			DBG(mb_info->mmap_addr);
			DBG(mb_info->mmap_length);
			/* keep allocations clear of the loader's map */
			check_higher(mb_info->mmap_addr + mb_info->mmap_length);

			/*
			 * Each record's size field excludes the size
			 * field itself, hence the extra sizeof ().
			 */
			for (mmap_addr = mb_info->mmap_addr;
			    mmap_addr < mb_info->mmap_addr +
			    mb_info->mmap_length;
			    mmap_addr += mmap->size + sizeof (mmap->size)) {
				mmap = (mb_memory_map_t *)(uintptr_t)mmap_addr;
				++num_entries;
			}

			num_entries_set = B_TRUE;
		}
		break;
	case 2:
		num_entries = dboot_multiboot2_efi_mmap_nentries(mb2_info);
		if (num_entries == 0)
			num_entries = dboot_multiboot2_mmap_nentries(mb2_info);
		if (num_entries == 0)
			dboot_panic("No memory map?\n");
		num_entries_set = B_TRUE;
		break;
	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (num_entries);
#else
	return (MAXMAPS);
#endif
}
723 
724 #if !defined(__xpv)
725 static uint32_t
dboot_efi_to_smap_type(int index,uint32_t type)726 dboot_efi_to_smap_type(int index, uint32_t type)
727 {
728 	uint64_t addr;
729 
730 	/*
731 	 * ACPI 6.1 tells the lower memory should be reported as
732 	 * normal memory, so we enforce page 0 type even as
733 	 * vmware maps it as acpi reclaimable.
734 	 */
735 	if (dboot_multiboot2_efi_mmap_get_base(mb2_info, index, &addr)) {
736 		if (addr == 0)
737 			return (1);
738 	}
739 
740 	/*
741 	 * Translate UEFI memory types to SMAP types.
742 	 * See "ACPI Specification Release 6.5 Errata A"
743 	 * Table 15-6 (page 785), UEFI Memory Types and mapping to ACPI address
744 	 * range types.
745 	 */
746 
747 	switch (type) {
748 	case EfiLoaderCode:
749 	case EfiLoaderData:
750 	case EfiBootServicesCode:
751 	case EfiBootServicesData:
752 	case EfiConventionalMemory:
753 		return (1);
754 	case EfiReservedMemoryType:
755 	case EfiRuntimeServicesCode:
756 	case EfiRuntimeServicesData:
757 	case EfiMemoryMappedIO:
758 	case EfiMemoryMappedIOPortSpace:
759 	case EfiPalCode:
760 	case EfiUnusableMemory:
761 		return (2);
762 	case EfiACPIReclaimMemory:
763 		return (3);
764 	case EfiACPIMemoryNVS:
765 		return (4);
766 	}
767 
768 	return (2);
769 }
770 #endif
771 
/*
 * Return the SMAP/ACPI address range type of memory map entry "index".
 * Multiboot 1 requires walking the variable-sized record list; for
 * multiboot 2 the EFI map (translated to SMAP types) is preferred over
 * the BIOS-style map.  Panics on a bad index or unknown loader.
 */
static uint32_t
dboot_loader_mmap_get_type(int index)
{
#if !defined(__xpv)
	mb_memory_map_t *mp, *mpend;
	uint32_t type;
	int i;

	switch (multiboot_version) {
	case 1:
		mp = (mb_memory_map_t *)(uintptr_t)mb_info->mmap_addr;
		mpend = (mb_memory_map_t *)(uintptr_t)
		    (mb_info->mmap_addr + mb_info->mmap_length);

		/* mp->size does not count the size field itself */
		for (i = 0; mp < mpend && i != index; i++)
			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
			    sizeof (mp->size));
		if (mp >= mpend) {
			dboot_panic("dboot_loader_mmap_get_type(): index "
			    "out of bounds: %d\n", index);
		}
		return (mp->type);

	case 2:
		if (dboot_multiboot2_efi_mmap_get_type(mb2_info, index, &type))
			return (dboot_efi_to_smap_type(index, type));

		if (dboot_multiboot2_mmap_get_type(mb2_info, index, &type))
			return (type);

		/* dboot_panic() does not return, so no break is needed */
		dboot_panic("Can not get memory type for %d\n", index);

	default:
		dboot_panic("Unknown multiboot version: %d\n",
		    multiboot_version);
		break;
	}
	return (0);
#else
	return (map_buffer[index].type);
#endif
}
814 
815 static uint64_t
dboot_loader_mmap_get_base(int index)816 dboot_loader_mmap_get_base(int index)
817 {
818 #if !defined(__xpv)
819 	mb_memory_map_t *mp, *mpend;
820 	uint64_t base;
821 	int i;
822 
823 	switch (multiboot_version) {
824 	case 1:
825 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
826 		mpend = (mb_memory_map_t *)
827 		    (mb_info->mmap_addr + mb_info->mmap_length);
828 
829 		for (i = 0; mp < mpend && i != index; i++)
830 			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
831 			    sizeof (mp->size));
832 		if (mp >= mpend) {
833 			dboot_panic("dboot_loader_mmap_get_base(): index "
834 			    "out of bounds: %d\n", index);
835 		}
836 		return (((uint64_t)mp->base_addr_high << 32) +
837 		    (uint64_t)mp->base_addr_low);
838 
839 	case 2:
840 		if (dboot_multiboot2_efi_mmap_get_base(mb2_info, index, &base))
841 			return (base);
842 
843 		if (dboot_multiboot2_mmap_get_base(mb2_info, index, &base))
844 			return (base);
845 
846 		dboot_panic("Can not get memory address for %d\n", index);
847 
848 	default:
849 		dboot_panic("Unknown multiboot version: %d\n",
850 		    multiboot_version);
851 		break;
852 	}
853 	return (0);
854 #else
855 	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
856 	    (uint64_t)map_buffer[index].base_addr_low);
857 #endif
858 }
859 
860 static uint64_t
dboot_loader_mmap_get_length(int index)861 dboot_loader_mmap_get_length(int index)
862 {
863 #if !defined(__xpv)
864 	mb_memory_map_t *mp, *mpend;
865 	uint64_t length;
866 	int i;
867 
868 	switch (multiboot_version) {
869 	case 1:
870 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
871 		mpend = (mb_memory_map_t *)
872 		    (mb_info->mmap_addr + mb_info->mmap_length);
873 
874 		for (i = 0; mp < mpend && i != index; i++)
875 			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
876 			    sizeof (mp->size));
877 		if (mp >= mpend) {
878 			dboot_panic("dboot_loader_mmap_get_length(): index "
879 			    "out of bounds: %d\n", index);
880 		}
881 		return (((uint64_t)mp->length_high << 32) +
882 		    (uint64_t)mp->length_low);
883 
884 	case 2:
885 		if (dboot_multiboot2_efi_mmap_get_length(mb2_info,
886 		    index, &length))
887 			return (length);
888 
889 		if (dboot_multiboot2_mmap_get_length(mb2_info,
890 		    index, &length))
891 			return (length);
892 
893 		dboot_panic("Can not get memory length for %d\n", index);
894 
895 	default:
896 		dboot_panic("Unknown multiboot version: %d\n",
897 		    multiboot_version);
898 		break;
899 	}
900 	return (0);
901 #else
902 	return (((uint64_t)map_buffer[index].length_high << 32) +
903 	    (uint64_t)map_buffer[index].length_low);
904 #endif
905 }
906 
907 static void
build_pcimemlists(void)908 build_pcimemlists(void)
909 {
910 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
911 	uint64_t start;
912 	uint64_t end;
913 	int i, num;
914 
915 	if (prom_debug)
916 		dboot_printf("building pcimemlists:\n");
917 	/*
918 	 * initialize
919 	 */
920 	pcimemlists[0].addr = pci_lo_limit;
921 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
922 	pcimemlists_used = 1;
923 
924 	num = dboot_loader_mmap_entries();
925 	/*
926 	 * Fill in PCI memlists.
927 	 */
928 	for (i = 0; i < num; ++i) {
929 		start = dboot_loader_mmap_get_base(i);
930 		end = start + dboot_loader_mmap_get_length(i);
931 
932 		if (prom_debug)
933 			dboot_printf("\ttype: %d %" PRIx64 "..%"
934 			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
935 			    start, end);
936 
937 		/*
938 		 * page align start and end
939 		 */
940 		start = (start + page_offset) & ~page_offset;
941 		end &= ~page_offset;
942 		if (end <= start)
943 			continue;
944 
945 		exclude_from_pci(start, end);
946 	}
947 
948 	/*
949 	 * Finish off the pcimemlist
950 	 */
951 	if (prom_debug) {
952 		for (i = 0; i < pcimemlists_used; ++i) {
953 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
954 			    PRIx64 "\n", pcimemlists[i].addr,
955 			    pcimemlists[i].addr + pcimemlists[i].size);
956 		}
957 	}
958 	pcimemlists[0].next = 0;
959 	pcimemlists[0].prev = 0;
960 	for (i = 1; i < pcimemlists_used; ++i) {
961 		pcimemlists[i].prev =
962 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
963 		pcimemlists[i].next = 0;
964 		pcimemlists[i - 1].next =
965 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
966 	}
967 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
968 	DBG(bi->bi_pcimem);
969 }
970 
971 #if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info:
 * carve out the scratch allocation area above the stack, record the
 * (at most one) module the domain builder handed us, and build the
 * memlists for the single contiguous pseudo-physical range.
 */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressable range.  We'll
	 * switch to new page tables before we unpack the kernel
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr =
		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list; only dom0 is given the real
		 * machine memory map by the hypervisor.
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists();
	}
}
1058 
1059 #else	/* !__xpv */
1060 
/*
 * Multiboot 1: we do not pick up framebuffer information from the
 * loader here, so record that none is available for console setup.
 */
static void
dboot_multiboot1_xboot_consinfo(void)
{
	fb->framebuffer = 0;
}
1066 
1067 static void
dboot_multiboot2_xboot_consinfo(void)1068 dboot_multiboot2_xboot_consinfo(void)
1069 {
1070 	multiboot_tag_framebuffer_t *fbtag;
1071 	fbtag = dboot_multiboot2_find_tag(mb2_info,
1072 	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
1073 	fb->framebuffer = (uint64_t)(uintptr_t)fbtag;
1074 }
1075 
1076 static int
dboot_multiboot_modcount(void)1077 dboot_multiboot_modcount(void)
1078 {
1079 	switch (multiboot_version) {
1080 	case 1:
1081 		return (mb_info->mods_count);
1082 
1083 	case 2:
1084 		return (dboot_multiboot2_modcount(mb2_info));
1085 
1086 	default:
1087 		dboot_panic("Unknown multiboot version: %d\n",
1088 		    multiboot_version);
1089 		break;
1090 	}
1091 	return (0);
1092 }
1093 
1094 static uint32_t
dboot_multiboot_modstart(int index)1095 dboot_multiboot_modstart(int index)
1096 {
1097 	switch (multiboot_version) {
1098 	case 1:
1099 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1100 
1101 	case 2:
1102 		return (dboot_multiboot2_modstart(mb2_info, index));
1103 
1104 	default:
1105 		dboot_panic("Unknown multiboot version: %d\n",
1106 		    multiboot_version);
1107 		break;
1108 	}
1109 	return (0);
1110 }
1111 
1112 static uint32_t
dboot_multiboot_modend(int index)1113 dboot_multiboot_modend(int index)
1114 {
1115 	switch (multiboot_version) {
1116 	case 1:
1117 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1118 
1119 	case 2:
1120 		return (dboot_multiboot2_modend(mb2_info, index));
1121 
1122 	default:
1123 		dboot_panic("Unknown multiboot version: %d\n",
1124 		    multiboot_version);
1125 		break;
1126 	}
1127 	return (0);
1128 }
1129 
1130 static char *
dboot_multiboot_modcmdline(int index)1131 dboot_multiboot_modcmdline(int index)
1132 {
1133 	switch (multiboot_version) {
1134 	case 1:
1135 		return ((char *)((mb_module_t *)
1136 		    mb_info->mods_addr)[index].mod_name);
1137 
1138 	case 2:
1139 		return (dboot_multiboot2_modcmdline(mb2_info, index));
1140 
1141 	default:
1142 		dboot_panic("Unknown multiboot version: %d\n",
1143 		    multiboot_version);
1144 		break;
1145 	}
1146 	return (0);
1147 }
1148 
1149 /*
1150  * Find the modules used by console setup.
1151  * Since we need the console to print early boot messages, the console is set up
1152  * before anything else and therefore we need to pick up the needed modules.
1153  *
1154  * Note, we just will search for and if found, will pass the modules
1155  * to console setup, the proper module list processing will happen later.
1156  * Currently used modules are boot environment and console font.
1157  */
1158 static void
dboot_find_console_modules(void)1159 dboot_find_console_modules(void)
1160 {
1161 	int i, modcount;
1162 	uint32_t mod_start, mod_end;
1163 	char *cmdline;
1164 
1165 	modcount = dboot_multiboot_modcount();
1166 	bi->bi_module_cnt = 0;
1167 	for (i = 0; i < modcount; ++i) {
1168 		cmdline = dboot_multiboot_modcmdline(i);
1169 		if (cmdline == NULL)
1170 			continue;
1171 
1172 		if (strstr(cmdline, "type=console-font") != NULL)
1173 			modules[bi->bi_module_cnt].bm_type = BMT_FONT;
1174 		else if (strstr(cmdline, "type=environment") != NULL)
1175 			modules[bi->bi_module_cnt].bm_type = BMT_ENV;
1176 		else
1177 			continue;
1178 
1179 		mod_start = dboot_multiboot_modstart(i);
1180 		mod_end = dboot_multiboot_modend(i);
1181 		modules[bi->bi_module_cnt].bm_addr =
1182 		    (native_ptr_t)(uintptr_t)mod_start;
1183 		modules[bi->bi_module_cnt].bm_size = mod_end - mod_start;
1184 		modules[bi->bi_module_cnt].bm_name =
1185 		    (native_ptr_t)(uintptr_t)NULL;
1186 		modules[bi->bi_module_cnt].bm_hash =
1187 		    (native_ptr_t)(uintptr_t)NULL;
1188 		bi->bi_module_cnt++;
1189 	}
1190 	if (bi->bi_module_cnt != 0)
1191 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1192 }
1193 
1194 static boolean_t
dboot_multiboot_basicmeminfo(uint32_t * lower,uint32_t * upper)1195 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1196 {
1197 	boolean_t rv = B_FALSE;
1198 
1199 	switch (multiboot_version) {
1200 	case 1:
1201 		if (mb_info->flags & 0x01) {
1202 			*lower = mb_info->mem_lower;
1203 			*upper = mb_info->mem_upper;
1204 			rv = B_TRUE;
1205 		}
1206 		break;
1207 
1208 	case 2:
1209 		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1210 
1211 	default:
1212 		dboot_panic("Unknown multiboot version: %d\n",
1213 		    multiboot_version);
1214 		break;
1215 	}
1216 	return (rv);
1217 }
1218 
/*
 * Convert a single ASCII hex digit ('0'-'9', 'a'-'f', 'A'-'F') to its
 * 4-bit value; panic on any other character.
 *
 * The previous version only checked lower bounds, so characters such as
 * 'g'..'z' or ':'..'@' were silently mapped to out-of-range nibble
 * values instead of being rejected; only chars below '0' panicked.
 * Bound both ends of each range so any non-hex input is caught.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);
	if (v >= '0' && v <= '9')
		return (v - '0');

	dboot_panic("bad ASCII hex character %c\n", v);
	return (0);
}
1233 
1234 static void
digest_a2h(const char * ascii,uint8_t * digest)1235 digest_a2h(const char *ascii, uint8_t *digest)
1236 {
1237 	unsigned int i;
1238 
1239 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1240 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1241 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1242 	}
1243 }
1244 
1245 /*
1246  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1247  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1248  * match, return 0, otherwise -1.  This works only for images smaller than
1249  * 4 GB, which should not be a problem.
1250  */
1251 static int
check_image_hash(uint_t midx)1252 check_image_hash(uint_t midx)
1253 {
1254 	const char *ascii;
1255 	const void *image;
1256 	size_t len;
1257 	SHA1_CTX ctx;
1258 	uint8_t digest[SHA1_DIGEST_LENGTH];
1259 	uint8_t baseline[SHA1_DIGEST_LENGTH];
1260 	unsigned int i;
1261 
1262 	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1263 	image = (const void *)(uintptr_t)modules[midx].bm_addr;
1264 	len = (size_t)modules[midx].bm_size;
1265 
1266 	digest_a2h(ascii, baseline);
1267 
1268 	SHA1Init(&ctx);
1269 	SHA1Update(&ctx, image, len);
1270 	SHA1Final(digest, &ctx);
1271 
1272 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1273 		if (digest[i] != baseline[i])
1274 			return (-1);
1275 	}
1276 
1277 	return (0);
1278 }
1279 
1280 static const char *
type_to_str(boot_module_type_t type)1281 type_to_str(boot_module_type_t type)
1282 {
1283 	switch (type) {
1284 	case BMT_ROOTFS:
1285 		return ("rootfs");
1286 	case BMT_FILE:
1287 		return ("file");
1288 	case BMT_HASH:
1289 		return ("hash");
1290 	case BMT_ENV:
1291 		return ("environment");
1292 	case BMT_FONT:
1293 		return ("console-font");
1294 	default:
1295 		return ("unknown");
1296 	}
1297 }
1298 
1299 static void
check_images(void)1300 check_images(void)
1301 {
1302 	uint_t i;
1303 	char displayhash[SHA1_ASCII_LENGTH + 1];
1304 
1305 	for (i = 0; i < modules_used; i++) {
1306 		if (prom_debug) {
1307 			dboot_printf("module #%d: name %s type %s "
1308 			    "addr %lx size %lx\n",
1309 			    i, (char *)(uintptr_t)modules[i].bm_name,
1310 			    type_to_str(modules[i].bm_type),
1311 			    (ulong_t)modules[i].bm_addr,
1312 			    (ulong_t)modules[i].bm_size);
1313 		}
1314 
1315 		if (modules[i].bm_type == BMT_HASH ||
1316 		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1317 			DBG_MSG("module has no hash; skipping check\n");
1318 			continue;
1319 		}
1320 		(void) memcpy(displayhash,
1321 		    (void *)(uintptr_t)modules[i].bm_hash,
1322 		    SHA1_ASCII_LENGTH);
1323 		displayhash[SHA1_ASCII_LENGTH] = '\0';
1324 		if (prom_debug) {
1325 			dboot_printf("checking expected hash [%s]: ",
1326 			    displayhash);
1327 		}
1328 
1329 		if (check_image_hash(i) != 0)
1330 			dboot_panic("hash mismatch!\n");
1331 		else
1332 			DBG_MSG("OK\n");
1333 	}
1334 }
1335 
1336 /*
1337  * Determine the module's starting address, size, name, and type, and fill the
1338  * boot_modules structure.  This structure is used by the bop code, except for
1339  * hashes which are checked prior to transferring control to the kernel.
1340  */
static void
process_module(int midx)
{
	uint32_t mod_start = dboot_multiboot_modstart(midx);
	uint32_t mod_end = dboot_multiboot_modend(midx);
	char *cmdline = dboot_multiboot_modcmdline(midx);
	char *p, *q;

	/* Track the highest address used so far (for the allocator). */
	check_higher(mod_end);
	if (prom_debug) {
		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
	}

	if (mod_start > mod_end) {
		dboot_panic("module #%d: module start address 0x%lx greater "
		    "than end address 0x%lx", midx,
		    (ulong_t)mod_start, (ulong_t)mod_end);
	}

	/*
	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
	 * the address of the last valid byte in a module plus 1 as mod_end.
	 * This is of course a bug; the multiboot specification simply states
	 * that mod_start and mod_end "contain the start and end addresses of
	 * the boot module itself" which is pretty obviously not what GRUB is
	 * doing.  However, fixing it requires that not only this code be
	 * changed but also that other code consuming this value and values
	 * derived from it be fixed, and that the kernel and GRUB must either
	 * both have the bug or neither.  While there are a lot of combinations
	 * that will work, there are also some that won't, so for simplicity
	 * we'll just cope with the bug.  That means we won't actually hash the
	 * byte at mod_end, and we will expect that mod_end for the hash file
	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
	 * hash plus a newline for each module).  We set bm_size to the true
	 * correct number of bytes in each module, achieving exactly this.
	 */

	/* Defaults: anonymous plain file, no hash. */
	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
	modules[midx].bm_size = mod_end - mod_start;
	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
	modules[midx].bm_type = BMT_FILE;

	if (cmdline == NULL) {
		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
		return;
	}

	/*
	 * Parse the cmdline in place with strsep(); the first token is the
	 * module name.  Note that bm_name and bm_hash end up pointing into
	 * the (now NUL-punctuated) cmdline buffer itself.
	 */
	p = cmdline;
	modules[midx].bm_name =
	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");

	while (p != NULL) {
		q = strsep(&p, " \t\f\n\r");
		if (strncmp(q, "name=", 5) == 0) {
			/* Only accept a non-empty value. */
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_name =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		if (strncmp(q, "type=", 5) == 0) {
			if (q[5] == '\0' || isspace(q[5]))
				continue;
			q += 5;
			if (strcmp(q, "rootfs") == 0) {
				modules[midx].bm_type = BMT_ROOTFS;
			} else if (strcmp(q, "hash") == 0) {
				modules[midx].bm_type = BMT_HASH;
			} else if (strcmp(q, "environment") == 0) {
				modules[midx].bm_type = BMT_ENV;
			} else if (strcmp(q, "console-font") == 0) {
				modules[midx].bm_type = BMT_FONT;
			} else if (strcmp(q, "file") != 0) {
				/* Unrecognized type: warn, keep BMT_FILE. */
				dboot_printf("\tmodule #%d: unknown module "
				    "type '%s'; defaulting to 'file'\n",
				    midx, q);
			}
			continue;
		}

		if (strncmp(q, "hash=", 5) == 0) {
			if (q[5] != '\0' && !isspace(q[5])) {
				modules[midx].bm_hash =
				    (native_ptr_t)(uintptr_t)(q + 5);
			}
			continue;
		}

		dboot_printf("ignoring unknown option '%s'\n", q);
	}
}
1435 
1436 /*
1437  * Backward compatibility: if there are exactly one or two modules, both
1438  * of type 'file' and neither with an embedded hash value, we have been
1439  * given the legacy style modules.  In this case we need to treat the first
1440  * module as a rootfs and the second as a hash referencing that module.
1441  * Otherwise, even if the configuration is invalid, we assume that the
1442  * operator knows what he's doing or at least isn't being bitten by this
1443  * interface change.
1444  */
1445 static void
fixup_modules(void)1446 fixup_modules(void)
1447 {
1448 	if (modules_used == 0 || modules_used > 2)
1449 		return;
1450 
1451 	if (modules[0].bm_type != BMT_FILE ||
1452 	    (modules_used > 1 && modules[1].bm_type != BMT_FILE)) {
1453 		return;
1454 	}
1455 
1456 	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1457 	    (modules_used > 1 &&
1458 	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL)) {
1459 		return;
1460 	}
1461 
1462 	modules[0].bm_type = BMT_ROOTFS;
1463 	if (modules_used > 1) {
1464 		modules[1].bm_type = BMT_HASH;
1465 		modules[1].bm_name = modules[0].bm_name;
1466 	}
1467 }
1468 
1469 /*
1470  * For modules that do not have assigned hashes but have a separate hash module,
1471  * find the assigned hash module and set the primary module's bm_hash to point
1472  * to the hash data from that module.  We will then ignore modules of type
1473  * BMT_HASH from this point forward.
1474  */
1475 static void
assign_module_hashes(void)1476 assign_module_hashes(void)
1477 {
1478 	uint_t i, j;
1479 
1480 	for (i = 0; i < modules_used; i++) {
1481 		if (modules[i].bm_type == BMT_HASH ||
1482 		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1483 			continue;
1484 		}
1485 
1486 		for (j = 0; j < modules_used; j++) {
1487 			if (modules[j].bm_type != BMT_HASH ||
1488 			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1489 			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1490 				continue;
1491 			}
1492 
1493 			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1494 				dboot_printf("Short hash module of length "
1495 				    "0x%lx bytes; ignoring\n",
1496 				    (ulong_t)modules[j].bm_size);
1497 			} else {
1498 				modules[i].bm_hash = modules[j].bm_addr;
1499 			}
1500 			break;
1501 		}
1502 	}
1503 }
1504 
1505 /*
1506  * Walk through the module information finding the last used address.
1507  * The first available address will become the top level page table.
1508  */
1509 static void
dboot_process_modules(void)1510 dboot_process_modules(void)
1511 {
1512 	int i, modcount;
1513 	extern char _end[];
1514 
1515 	DBG_MSG("\nFinding Modules\n");
1516 	modcount = dboot_multiboot_modcount();
1517 	if (modcount > MAX_BOOT_MODULES) {
1518 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1519 		    modcount, MAX_BOOT_MODULES);
1520 	}
1521 	/*
1522 	 * search the modules to find the last used address
1523 	 * we'll build the module list while we're walking through here
1524 	 */
1525 	check_higher((paddr_t)(uintptr_t)&_end);
1526 	for (i = 0; i < modcount; ++i) {
1527 		process_module(i);
1528 		modules_used++;
1529 	}
1530 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1531 	DBG(bi->bi_modules);
1532 	bi->bi_module_cnt = modcount;
1533 	DBG(bi->bi_module_cnt);
1534 
1535 	fixup_modules();
1536 	assign_module_hashes();
1537 	check_images();
1538 }
1539 
1540 /*
1541  * We then build the phys_install memlist from the multiboot information.
1542  */
static void
dboot_process_mmap(void)
{
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint32_t lower, upper, type, t;
	int i, mmap_entries;

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	num_entries = 0;
	num_entries_set = B_FALSE;
	max_mem = 0;
	/* 't' remembers the type of the last accepted range, for merging. */
	t = 0;
	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
		struct boot_memlist *mlist;
		uint_t *indexp;

		for (i = 0; i < mmap_entries; i++) {
			start = dboot_loader_mmap_get_base(i);
			end = start + dboot_loader_mmap_get_length(i);
			type = dboot_loader_mmap_get_type(i);

			if (prom_debug)
				dboot_printf("\ttype: %u %" PRIx64 "..%"
				    PRIx64 "\n", type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM
			 */
			switch (type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				mlist = memlists;
				indexp = &memlists_used;
				break;
			case 2:
				/* type 2 ranges become BIOS-reserved lists */
				mlist = rsvdmemlists;
				indexp = &rsvdmemlists_used;
				break;
			default:
				continue;
			}

			if (memlists_used > MAX_MEMLIST)
				dboot_panic("too many memlists");
			if (rsvdmemlists_used > MAX_MEMLIST)
				dboot_panic("too many rsvdmemlists");

			/*
			 * Coalesce with the current entry when this range is
			 * the same type and physically contiguous with it.
			 */
			if (mlist[*indexp].size != 0 &&
			    type == t &&
			    (mlist[*indexp].addr +
			    mlist[*indexp].size) == start) {
				mlist[*indexp].size =
				    end - mlist[*indexp].addr;
				continue;
			}
			/* do we need new entry? */
			if (mlist[*indexp].size != 0) {
				*indexp = *indexp + 1;
				if (*indexp > MAX_MEMLIST)
					continue;
			}

			t = type;
			mlist[*indexp].addr = start;
			mlist[*indexp].size = end - start;
		}

		/*
		 * The counts lag one behind during the loop; bump them to
		 * cover the final in-progress entry of each list.
		 */
		if (memlists[memlists_used].size != 0) {
			memlists_used++;
		}
		if (rsvdmemlists[rsvdmemlists_used].size != 0) {
			rsvdmemlists_used++;
		}

		if (prom_debug) {
			for (i = 0; i < memlists_used; i++) {
				dboot_printf("memlists[%u] %"
				    PRIx64 "..%" PRIx64 "\n",
				    i,
				    memlists[i].addr,
				    memlists[i].size);
			}
			for (i = 0; i < rsvdmemlists_used; i++) {
				dboot_printf("rsvdmemlists[%u] %"
				    PRIx64 "..%" PRIx64 "\n",
				    i,
				    rsvdmemlists[i].addr,
				    rsvdmemlists[i].size);
			}
		}

		build_pcimemlists();
	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
		/*
		 * No memory map; fall back to the basic lower/upper memory
		 * report (both values are in KB).
		 */
		DBG(lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = lower * 1024;
		++memlists_used;
		DBG(upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}
1683 
1684 /*
1685  * The highest address is used as the starting point for dboot's simple
1686  * memory allocator.
1687  *
1688  * Finding the highest address in case of Multiboot 1 protocol is
1689  * quite painful in the sense that some information provided by
1690  * the multiboot info structure points to BIOS data, and some to RAM.
1691  *
1692  * The module list was processed and checked already by dboot_process_modules(),
1693  * so we will check the command line string and the memory map.
1694  *
1695  * This list of to be checked items is based on our current knowledge of
1696  * allocations made by grub1 and will need to be reviewed if there
1697  * are updates about the information provided by Multiboot 1.
1698  *
1699  * In the case of the Multiboot 2, our life is much simpler, as the MB2
1700  * information tag list is one contiguous chunk of memory.
1701  */
1702 static paddr_t
dboot_multiboot1_highest_addr(void)1703 dboot_multiboot1_highest_addr(void)
1704 {
1705 	paddr_t addr = (paddr_t)(uintptr_t)NULL;
1706 	char *cmdl = (char *)mb_info->cmdline;
1707 
1708 	if (mb_info->flags & MB_INFO_CMDLINE)
1709 		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1710 
1711 	if (mb_info->flags & MB_INFO_MEM_MAP)
1712 		addr = MAX(addr,
1713 		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1714 	return (addr);
1715 }
1716 
1717 static void
dboot_multiboot_highest_addr(void)1718 dboot_multiboot_highest_addr(void)
1719 {
1720 	paddr_t addr;
1721 
1722 	switch (multiboot_version) {
1723 	case 1:
1724 		addr = dboot_multiboot1_highest_addr();
1725 		if (addr != (paddr_t)(uintptr_t)NULL)
1726 			check_higher(addr);
1727 		break;
1728 	case 2:
1729 		addr = dboot_multiboot2_highest_addr(mb2_info);
1730 		if (addr != (paddr_t)(uintptr_t)NULL)
1731 			check_higher(addr);
1732 		break;
1733 	default:
1734 		dboot_panic("Unknown multiboot version: %d\n",
1735 		    multiboot_version);
1736 		break;
1737 	}
1738 }
1739 
1740 /*
1741  * Walk the boot loader provided information and find the highest free address.
1742  */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	dboot_process_modules();	/* module list; bumps highest addr */
	dboot_process_mmap();		/* memory map -> memlists */
	dboot_multiboot_highest_addr();	/* cmdline/mmap buffer addresses */
}
1751 
1752 static int
dboot_same_guids(efi_guid_t * g1,efi_guid_t * g2)1753 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1754 {
1755 	int i;
1756 
1757 	if (g1->time_low != g2->time_low)
1758 		return (0);
1759 	if (g1->time_mid != g2->time_mid)
1760 		return (0);
1761 	if (g1->time_hi_and_version != g2->time_hi_and_version)
1762 		return (0);
1763 	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1764 		return (0);
1765 	if (g1->clock_seq_low != g2->clock_seq_low)
1766 		return (0);
1767 
1768 	for (i = 0; i < 6; i++) {
1769 		if (g1->node_addr[i] != g2->node_addr[i])
1770 			return (0);
1771 	}
1772 	return (1);
1773 }
1774 
1775 static void
process_efi32(EFI_SYSTEM_TABLE32 * efi)1776 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1777 {
1778 	uint32_t entries;
1779 	EFI_CONFIGURATION_TABLE32 *config;
1780 	efi_guid_t VendorGuid;
1781 	int i;
1782 
1783 	entries = efi->NumberOfTableEntries;
1784 	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1785 	    efi->ConfigurationTable;
1786 
1787 	for (i = 0; i < entries; i++) {
1788 		(void) memcpy(&VendorGuid, &config[i].VendorGuid,
1789 		    sizeof (VendorGuid));
1790 		if (dboot_same_guids(&VendorGuid, &smbios3)) {
1791 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1792 			    config[i].VendorTable;
1793 		}
1794 		if (bi->bi_smbios == 0 &&
1795 		    dboot_same_guids(&VendorGuid, &smbios)) {
1796 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1797 			    config[i].VendorTable;
1798 		}
1799 		if (dboot_same_guids(&VendorGuid, &acpi2)) {
1800 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1801 			    config[i].VendorTable;
1802 		}
1803 		if (bi->bi_acpi_rsdp == 0 &&
1804 		    dboot_same_guids(&VendorGuid, &acpi1)) {
1805 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1806 			    config[i].VendorTable;
1807 		}
1808 	}
1809 }
1810 
1811 static void
process_efi64(EFI_SYSTEM_TABLE64 * efi)1812 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1813 {
1814 	uint64_t entries;
1815 	EFI_CONFIGURATION_TABLE64 *config;
1816 	efi_guid_t VendorGuid;
1817 	int i;
1818 
1819 	entries = efi->NumberOfTableEntries;
1820 	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1821 	    efi->ConfigurationTable;
1822 
1823 	for (i = 0; i < entries; i++) {
1824 		(void) memcpy(&VendorGuid, &config[i].VendorGuid,
1825 		    sizeof (VendorGuid));
1826 		if (dboot_same_guids(&VendorGuid, &smbios3)) {
1827 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1828 			    config[i].VendorTable;
1829 		}
1830 		if (bi->bi_smbios == 0 &&
1831 		    dboot_same_guids(&VendorGuid, &smbios)) {
1832 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1833 			    config[i].VendorTable;
1834 		}
1835 		/* Prefer acpi v2+ over v1. */
1836 		if (dboot_same_guids(&VendorGuid, &acpi2)) {
1837 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1838 			    config[i].VendorTable;
1839 		}
1840 		if (bi->bi_acpi_rsdp == 0 &&
1841 		    dboot_same_guids(&VendorGuid, &acpi1)) {
1842 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1843 			    config[i].VendorTable;
1844 		}
1845 	}
1846 }
1847 
1848 static void
dboot_multiboot_get_fwtables(void)1849 dboot_multiboot_get_fwtables(void)
1850 {
1851 	multiboot_tag_new_acpi_t *nacpitagp;
1852 	multiboot_tag_old_acpi_t *oacpitagp;
1853 	multiboot_tag_efi64_t *efi64tagp = NULL;
1854 	multiboot_tag_efi32_t *efi32tagp = NULL;
1855 
1856 	/* no fw tables from multiboot 1 */
1857 	if (multiboot_version != 2)
1858 		return;
1859 
1860 	efi64tagp = (multiboot_tag_efi64_t *)
1861 	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1862 	if (efi64tagp != NULL) {
1863 		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1864 		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1865 		    efi64tagp->mb_pointer;
1866 		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1867 		    efi64tagp->mb_pointer);
1868 	} else {
1869 		efi32tagp = (multiboot_tag_efi32_t *)
1870 		    dboot_multiboot2_find_tag(mb2_info,
1871 		    MULTIBOOT_TAG_TYPE_EFI32);
1872 		if (efi32tagp != NULL) {
1873 			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1874 			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1875 			    efi32tagp->mb_pointer;
1876 			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1877 			    efi32tagp->mb_pointer);
1878 		}
1879 	}
1880 
1881 	/*
1882 	 * The multiboot2 info contains a copy of the RSDP; stash a pointer to
1883 	 * it (see find_rsdp() in fakebop).
1884 	 */
1885 	nacpitagp = (multiboot_tag_new_acpi_t *)
1886 	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_NEW);
1887 	oacpitagp = (multiboot_tag_old_acpi_t *)
1888 	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_OLD);
1889 
1890 	if (nacpitagp != NULL) {
1891 		bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1892 		    &nacpitagp->mb_rsdp[0];
1893 	} else if (oacpitagp != NULL) {
1894 		bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1895 		    &oacpitagp->mb_rsdp[0];
1896 	}
1897 }
1898 
/*
 * Print an EFI revision word as "major.minor" (or "major.minor.patch"
 * when the minor field carries a nonzero last digit), with a newline.
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor = EFI_REV_MINOR(ver);

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	if (minor % 10 == 0)
		dboot_printf("%d\n", minor / 10);
	else
		dboot_printf("%d.%d\n", minor / 10, minor % 10);
}
1914 
1915 static void
print_efi32(EFI_SYSTEM_TABLE32 * efi)1916 print_efi32(EFI_SYSTEM_TABLE32 *efi)
1917 {
1918 	uint16_t *data;
1919 	EFI_CONFIGURATION_TABLE32 *conf;
1920 	int i;
1921 
1922 	dboot_printf("EFI32 signature: %llx\n",
1923 	    (unsigned long long)efi->Hdr.Signature);
1924 	dboot_printf("EFI system version: ");
1925 	dboot_print_efi_version(efi->Hdr.Revision);
1926 	dboot_printf("EFI system vendor: ");
1927 	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1928 	for (i = 0; data[i] != 0; i++)
1929 		dboot_printf("%c", (char)data[i]);
1930 	dboot_printf("\nEFI firmware revision: ");
1931 	dboot_print_efi_version(efi->FirmwareRevision);
1932 	dboot_printf("EFI system table number of entries: %d\n",
1933 	    efi->NumberOfTableEntries);
1934 	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1935 	    efi->ConfigurationTable;
1936 	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1937 		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1938 		    conf[i].VendorGuid.time_low,
1939 		    conf[i].VendorGuid.time_mid,
1940 		    conf[i].VendorGuid.time_hi_and_version,
1941 		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1942 		    conf[i].VendorGuid.clock_seq_low);
1943 		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1944 		    conf[i].VendorGuid.node_addr[0],
1945 		    conf[i].VendorGuid.node_addr[1],
1946 		    conf[i].VendorGuid.node_addr[2],
1947 		    conf[i].VendorGuid.node_addr[3],
1948 		    conf[i].VendorGuid.node_addr[4],
1949 		    conf[i].VendorGuid.node_addr[5]);
1950 	}
1951 }
1952 
/*
 * Dump the 64-bit EFI system table header, vendor string, and every
 * configuration-table GUID for boot-time debugging.
 */
static void
print_efi64(EFI_SYSTEM_TABLE64 *efi)
{
	uint16_t *data;		/* UCS-2 firmware vendor string */
	EFI_CONFIGURATION_TABLE64 *conf;
	int i;

	dboot_printf("EFI64 signature: %llx\n",
	    (unsigned long long)efi->Hdr.Signature);
	dboot_printf("EFI system version: ");
	dboot_print_efi_version(efi->Hdr.Revision);
	dboot_printf("EFI system vendor: ");
	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
	/* Print each UCS-2 character as its low byte. */
	for (i = 0; data[i] != 0; i++)
		dboot_printf("%c", (char)data[i]);
	dboot_printf("\nEFI firmware revision: ");
	dboot_print_efi_version(efi->FirmwareRevision);
	dboot_printf("EFI system table number of entries: %" PRIu64 "\n",
	    efi->NumberOfTableEntries);
	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
	    efi->ConfigurationTable;
	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
		    conf[i].VendorGuid.time_low,
		    conf[i].VendorGuid.time_mid,
		    conf[i].VendorGuid.time_hi_and_version,
		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
		    conf[i].VendorGuid.clock_seq_low);
		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
		    conf[i].VendorGuid.node_addr[0],
		    conf[i].VendorGuid.node_addr[1],
		    conf[i].VendorGuid.node_addr[2],
		    conf[i].VendorGuid.node_addr[3],
		    conf[i].VendorGuid.node_addr[4],
		    conf[i].VendorGuid.node_addr[5]);
	}
}
1990 #endif /* !__xpv */
1991 
1992 /*
1993  * Simple memory allocator, allocates aligned physical memory.
1994  * Note that startup_kernel() only allocates memory, never frees.
1995  * Memory usage just grows in an upward direction.
1996  */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	/*
	 * 'best' starts at (uint64_t)-size: the largest address for which
	 * best + size does not wrap, acting as the "nothing found yet"
	 * sentinel that any real candidate compares lower than.
	 */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	/* Allocations are always returned zeroed. */
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}
2057 
/*
 * Allocate 'size' bytes (rounded up to a page multiple) of page-aligned
 * zeroed physical memory.  Memory is never freed; see do_mem_alloc().
 */
void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}
2063 
2064 
2065 /*
2066  * Build page tables to map all of memory used so far as well as the kernel.
2067  */
2068 static void
build_page_tables(void)2069 build_page_tables(void)
2070 {
2071 	uint32_t psize;
2072 	uint32_t level;
2073 	uint32_t off;
2074 	uint64_t start;
2075 #if !defined(__xpv)
2076 	uint32_t i;
2077 	uint64_t end;
2078 #endif	/* __xpv */
2079 
2080 	/*
2081 	 * If we're on metal, we need to create the top level pagetable.
2082 	 */
2083 #if defined(__xpv)
2084 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
2085 #else /* __xpv */
2086 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
2087 #endif /* __xpv */
2088 	DBG((uintptr_t)top_page_table);
2089 
2090 	/*
2091 	 * Determine if we'll use large mappings for kernel, then map it.
2092 	 */
2093 	if (largepage_support) {
2094 		psize = lpagesize;
2095 		level = 1;
2096 	} else {
2097 		psize = MMU_PAGESIZE;
2098 		level = 0;
2099 	}
2100 
2101 	DBG_MSG("Mapping kernel\n");
2102 	DBG(ktext_phys);
2103 	DBG(target_kernel_text);
2104 	DBG(ksize);
2105 	DBG(psize);
2106 	for (off = 0; off < ksize; off += psize)
2107 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
2108 
2109 	/*
2110 	 * The kernel will need a 1 page window to work with page tables
2111 	 */
2112 	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
2113 	DBG(bi->bi_pt_window);
2114 	bi->bi_pte_to_pt_window =
2115 	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
2116 	DBG(bi->bi_pte_to_pt_window);
2117 
2118 #if defined(__xpv)
2119 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
2120 		/* If this is a domU we're done. */
2121 		DBG_MSG("\nPage tables constructed\n");
2122 		return;
2123 	}
2124 #endif /* __xpv */
2125 
2126 	/*
2127 	 * We need 1:1 mappings for the lower 1M of memory to access
2128 	 * BIOS tables used by a couple of drivers during boot.
2129 	 *
2130 	 * The following code works because our simple memory allocator
2131 	 * only grows usage in an upwards direction.
2132 	 *
2133 	 * Note that by this point in boot some mappings for low memory
2134 	 * may already exist because we've already accessed device in low
2135 	 * memory.  (Specifically the video frame buffer and keyboard
2136 	 * status ports.)  If we're booting on raw hardware then GRUB
2137 	 * created these mappings for us.  If we're booting under a
2138 	 * hypervisor then we went ahead and remapped these devices into
2139 	 * memory allocated within dboot itself.
2140 	 */
2141 	if (map_debug)
2142 		dboot_printf("1:1 map pa=0..1Meg\n");
2143 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
2144 #if defined(__xpv)
2145 		map_ma_at_va(start, start, 0);
2146 #else /* __xpv */
2147 		map_pa_at_va(start, start, 0);
2148 #endif /* __xpv */
2149 	}
2150 
2151 #if !defined(__xpv)
2152 
2153 	for (i = 0; i < memlists_used; ++i) {
2154 		start = memlists[i].addr;
2155 		end = start + memlists[i].size;
2156 
2157 		if (map_debug)
2158 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2159 			    start, end);
2160 		while (start < end && start < next_avail_addr) {
2161 			map_pa_at_va(start, start, 0);
2162 			start += MMU_PAGESIZE;
2163 		}
2164 		if (start >= next_avail_addr)
2165 			break;
2166 	}
2167 
2168 	/*
2169 	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
2170 	 * device and therefore must not be cached.
2171 	 */
2172 	if (fb != NULL && fb->framebuffer != 0) {
2173 		multiboot_tag_framebuffer_t *fb_tagp;
2174 		fb_tagp = (multiboot_tag_framebuffer_t *)(uintptr_t)
2175 		    fb->framebuffer;
2176 
2177 		start = fb_tagp->framebuffer_common.framebuffer_addr;
2178 		end = start + fb_tagp->framebuffer_common.framebuffer_height *
2179 		    fb_tagp->framebuffer_common.framebuffer_pitch;
2180 
2181 		if (map_debug)
2182 			dboot_printf("FB 1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2183 			    start, end);
2184 		pte_bits |= PT_NOCACHE;
2185 		if (PAT_support != 0)
2186 			pte_bits |= PT_PAT_4K;
2187 
2188 		while (start < end) {
2189 			map_pa_at_va(start, start, 0);
2190 			start += MMU_PAGESIZE;
2191 		}
2192 		pte_bits &= ~PT_NOCACHE;
2193 		if (PAT_support != 0)
2194 			pte_bits &= ~PT_PAT_4K;
2195 	}
2196 #endif /* !__xpv */
2197 
2198 	DBG_MSG("\nPage tables constructed\n");
2199 }
2200 
2201 #define	NO_MULTIBOOT	\
2202 "multiboot is no longer used to boot the Solaris Operating System.\n\
2203 The grub entry should be changed to:\n\
2204 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
2205 module$ /platform/i86pc/$ISADIR/boot_archive\n\
2206 See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2207 
2208 static void
dboot_init_xboot_consinfo(void)2209 dboot_init_xboot_consinfo(void)
2210 {
2211 	bi = &boot_info;
2212 
2213 #if !defined(__xpv)
2214 	fb = &framebuffer;
2215 	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
2216 
2217 	switch (multiboot_version) {
2218 	case 1:
2219 		dboot_multiboot1_xboot_consinfo();
2220 		break;
2221 	case 2:
2222 		dboot_multiboot2_xboot_consinfo();
2223 		break;
2224 	default:
2225 		dboot_panic("Unknown multiboot version: %d\n",
2226 		    multiboot_version);
2227 		break;
2228 	}
2229 	dboot_find_console_modules();
2230 #endif
2231 }
2232 
2233 /*
2234  * Set up basic data from the boot loader.
2235  * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2236  * 32-bit dboot code setup used to set up and start 64-bit kernel.
2237  * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2238  * start 64-bit illumos kernel.
2239  */
2240 static void
dboot_loader_init(void)2241 dboot_loader_init(void)
2242 {
2243 #if !defined(__xpv)
2244 	mb_info = NULL;
2245 	mb2_info = NULL;
2246 
2247 	switch (mb_magic) {
2248 	case MB_BOOTLOADER_MAGIC:
2249 		multiboot_version = 1;
2250 		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2251 #if defined(_BOOT_TARGET_amd64)
2252 		load_addr = mb_header.load_addr;
2253 #endif
2254 		break;
2255 
2256 	case MULTIBOOT2_BOOTLOADER_MAGIC:
2257 		multiboot_version = 2;
2258 		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2259 #if defined(_BOOT_TARGET_amd64)
2260 		load_addr = mb2_load_addr;
2261 #endif
2262 		break;
2263 
2264 	default:
2265 		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2266 		break;
2267 	}
2268 #endif	/* !defined(__xpv) */
2269 }
2270 
2271 /* Extract the kernel command line from [multi]boot information. */
2272 static char *
dboot_loader_cmdline(void)2273 dboot_loader_cmdline(void)
2274 {
2275 	char *line = NULL;
2276 
2277 #if defined(__xpv)
2278 	line = (char *)xen_info->cmd_line;
2279 #else /* __xpv */
2280 
2281 	switch (multiboot_version) {
2282 	case 1:
2283 		if (mb_info->flags & MB_INFO_CMDLINE)
2284 			line = (char *)mb_info->cmdline;
2285 		break;
2286 
2287 	case 2:
2288 		line = dboot_multiboot2_cmdline(mb2_info);
2289 		break;
2290 
2291 	default:
2292 		dboot_panic("Unknown multiboot version: %d\n",
2293 		    multiboot_version);
2294 		break;
2295 	}
2296 
2297 #endif /* __xpv */
2298 
2299 	/*
2300 	 * Make sure we have valid pointer so the string operations
2301 	 * will not crash us.
2302 	 */
2303 	if (line == NULL)
2304 		line = "";
2305 
2306 	return (line);
2307 }
2308 
2309 static char *
dboot_loader_name(void)2310 dboot_loader_name(void)
2311 {
2312 #if defined(__xpv)
2313 	return (NULL);
2314 #else /* __xpv */
2315 	multiboot_tag_string_t *tag;
2316 
2317 	switch (multiboot_version) {
2318 	case 1:
2319 		return ((char *)(uintptr_t)mb_info->boot_loader_name);
2320 
2321 	case 2:
2322 		tag = dboot_multiboot2_find_tag(mb2_info,
2323 		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2324 		return (tag->mb_string);
2325 	default:
2326 		dboot_panic("Unknown multiboot version: %d\n",
2327 		    multiboot_version);
2328 		break;
2329 	}
2330 
2331 	return (NULL);
2332 #endif /* __xpv */
2333 }
2334 
2335 /*
2336  * startup_kernel has a pretty simple job. It builds pagetables which reflect
2337  * 1:1 mappings for all memory in use. It then also adds mappings for
2338  * the kernel nucleus at virtual address of target_kernel_text using large page
2339  * mappings. The page table pages are also accessible at 1:1 mapped
2340  * virtual addresses.
2341  */
2342 /*ARGSUSED*/
2343 void
startup_kernel(void)2344 startup_kernel(void)
2345 {
2346 	char *cmdline;
2347 	char *bootloader;
2348 #if defined(__xpv)
2349 	physdev_set_iopl_t set_iopl;
2350 #endif /* __xpv */
2351 
2352 	if (dboot_debug == 1)
2353 		bcons_init(NULL);	/* Set very early console to ttya. */
2354 	dboot_loader_init();
2355 	/*
2356 	 * At this point we are executing in a 32 bit real mode.
2357 	 */
2358 
2359 	bootloader = dboot_loader_name();
2360 	cmdline = dboot_loader_cmdline();
2361 
2362 #if defined(__xpv)
2363 	/*
2364 	 * For dom0, before we initialize the console subsystem we'll
2365 	 * need to enable io operations, so set I/O priveldge level to 1.
2366 	 */
2367 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2368 		set_iopl.iopl = 1;
2369 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2370 	}
2371 #endif /* __xpv */
2372 
2373 	dboot_init_xboot_consinfo();
2374 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2375 	bcons_init(bi);		/* Now we can set the real console. */
2376 
2377 	prom_debug = (find_boot_prop("prom_debug") != NULL);
2378 	map_debug = (find_boot_prop("map_debug") != NULL);
2379 
2380 #if !defined(__xpv)
2381 	dboot_multiboot_get_fwtables();
2382 #endif
2383 	DBG_MSG("\n\nillumos prekernel set: ");
2384 	DBG_MSG(cmdline);
2385 	DBG_MSG("\n");
2386 
2387 	if (bootloader != NULL && prom_debug) {
2388 		dboot_printf("Kernel loaded by: %s\n", bootloader);
2389 #if !defined(__xpv)
2390 		dboot_printf("Using multiboot %d boot protocol.\n",
2391 		    multiboot_version);
2392 #endif
2393 	}
2394 
2395 	if (strstr(cmdline, "multiboot") != NULL) {
2396 		dboot_panic(NO_MULTIBOOT);
2397 	}
2398 
2399 	DBG((uintptr_t)bi);
2400 #if !defined(__xpv)
2401 	DBG((uintptr_t)mb_info);
2402 	DBG((uintptr_t)mb2_info);
2403 	if (mb2_info != NULL)
2404 		DBG(mb2_info->mbi_total_size);
2405 	DBG(bi->bi_acpi_rsdp);
2406 	DBG(bi->bi_acpi_rsdp_copy);
2407 	DBG(bi->bi_smbios);
2408 	DBG(bi->bi_uefi_arch);
2409 	DBG(bi->bi_uefi_systab);
2410 
2411 	if (bi->bi_uefi_systab && prom_debug) {
2412 		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2413 			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2414 			    bi->bi_uefi_systab);
2415 		} else {
2416 			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2417 			    bi->bi_uefi_systab);
2418 		}
2419 	}
2420 #endif
2421 
2422 	/*
2423 	 * Need correct target_kernel_text value
2424 	 */
2425 	target_kernel_text = KERNEL_TEXT;
2426 	DBG(target_kernel_text);
2427 
2428 #if defined(__xpv)
2429 
2430 	/*
2431 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
2432 	 */
2433 
2434 #if defined(_BOOT_TARGET_amd64)
2435 	/*
2436 	 * 64-bit hypervisor.
2437 	 */
2438 	amd64_support = 1;
2439 	pae_support = 1;
2440 
2441 #else	/* _BOOT_TARGET_amd64 */
2442 
2443 	/*
2444 	 * See if we are running on a PAE Hypervisor
2445 	 */
2446 	{
2447 		xen_capabilities_info_t caps;
2448 
2449 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2450 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
2451 		caps[sizeof (caps) - 1] = 0;
2452 		if (prom_debug)
2453 			dboot_printf("xen capabilities %s\n", caps);
2454 		if (strstr(caps, "x86_32p") != NULL)
2455 			pae_support = 1;
2456 	}
2457 
2458 #endif	/* _BOOT_TARGET_amd64 */
2459 	{
2460 		xen_platform_parameters_t p;
2461 
2462 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2463 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
2464 		DBG(p.virt_start);
2465 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2466 	}
2467 
2468 	/*
2469 	 * The hypervisor loads stuff starting at 1Gig
2470 	 */
2471 	mfn_base = ONE_GIG;
2472 	DBG(mfn_base);
2473 
2474 	/*
2475 	 * enable writable page table mode for the hypervisor
2476 	 */
2477 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2478 	    VMASST_TYPE_writable_pagetables) < 0)
2479 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2480 
2481 	/*
2482 	 * check for NX support
2483 	 */
2484 	if (pae_support) {
2485 		uint32_t eax = 0x80000000;
2486 		uint32_t edx = get_cpuid_edx(&eax);
2487 
2488 		if (eax >= 0x80000001) {
2489 			eax = 0x80000001;
2490 			edx = get_cpuid_edx(&eax);
2491 			if (edx & CPUID_AMD_EDX_NX)
2492 				NX_support = 1;
2493 		}
2494 	}
2495 
2496 	/*
2497 	 * check for PAT support
2498 	 */
2499 	{
2500 		uint32_t eax = 1;
2501 		uint32_t edx = get_cpuid_edx(&eax);
2502 
2503 		if (edx & CPUID_INTC_EDX_PAT)
2504 			PAT_support = 1;
2505 	}
2506 #if !defined(_BOOT_TARGET_amd64)
2507 
2508 	/*
2509 	 * The 32-bit hypervisor uses segmentation to protect itself from
2510 	 * guests. This means when a guest attempts to install a flat 4GB
2511 	 * code or data descriptor the 32-bit hypervisor will protect itself
2512 	 * by silently shrinking the segment such that if the guest attempts
2513 	 * any access where the hypervisor lives a #gp fault is generated.
2514 	 * The problem is that some applications expect a full 4GB flat
2515 	 * segment for their current thread pointer and will use negative
2516 	 * offset segment wrap around to access data. TLS support in linux
2517 	 * brand is one example of this.
2518 	 *
2519 	 * The 32-bit hypervisor can catch the #gp fault in these cases
2520 	 * and emulate the access without passing the #gp fault to the guest
2521 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2522 	 * Seems like this should have been the default.
2523 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
2524 	 * to deal with emulating these accesses.
2525 	 */
2526 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2527 	    VMASST_TYPE_4gb_segments) < 0)
2528 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2529 #endif	/* !_BOOT_TARGET_amd64 */
2530 
2531 #else	/* __xpv */
2532 
2533 	/*
2534 	 * use cpuid to enable MMU features
2535 	 */
2536 	if (have_cpuid()) {
2537 		uint32_t eax, edx;
2538 
2539 		eax = 1;
2540 		edx = get_cpuid_edx(&eax);
2541 		if (edx & CPUID_INTC_EDX_PSE)
2542 			largepage_support = 1;
2543 		if (edx & CPUID_INTC_EDX_PGE)
2544 			pge_support = 1;
2545 		if (edx & CPUID_INTC_EDX_PAE)
2546 			pae_support = 1;
2547 		if (edx & CPUID_INTC_EDX_PAT)
2548 			PAT_support = 1;
2549 
2550 		eax = 0x80000000;
2551 		edx = get_cpuid_edx(&eax);
2552 		if (eax >= 0x80000001) {
2553 			eax = 0x80000001;
2554 			edx = get_cpuid_edx(&eax);
2555 			if (edx & CPUID_AMD_EDX_LM)
2556 				amd64_support = 1;
2557 			if (edx & CPUID_AMD_EDX_NX)
2558 				NX_support = 1;
2559 		}
2560 	} else {
2561 		dboot_printf("cpuid not supported\n");
2562 	}
2563 #endif /* __xpv */
2564 
2565 
2566 #if defined(_BOOT_TARGET_amd64)
2567 	if (amd64_support == 0)
2568 		dboot_panic("long mode not supported, rebooting");
2569 	else if (pae_support == 0)
2570 		dboot_panic("long mode, but no PAE; rebooting");
2571 #else
2572 	/*
2573 	 * Allow the command line to over-ride use of PAE for 32 bit.
2574 	 */
2575 	if (strstr(cmdline, "disablePAE=true") != NULL) {
2576 		pae_support = 0;
2577 		NX_support = 0;
2578 		amd64_support = 0;
2579 	}
2580 #endif
2581 
2582 	/*
2583 	 * initialize the simple memory allocator
2584 	 */
2585 	init_mem_alloc();
2586 
2587 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2588 	/*
2589 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2590 	 */
2591 	if (max_mem < FOUR_GIG && NX_support == 0)
2592 		pae_support = 0;
2593 #endif
2594 
2595 	/*
2596 	 * configure mmu information
2597 	 */
2598 	if (pae_support) {
2599 		shift_amt = shift_amt_pae;
2600 		ptes_per_table = 512;
2601 		pte_size = 8;
2602 		lpagesize = TWO_MEG;
2603 #if defined(_BOOT_TARGET_amd64)
2604 		top_level = 3;
2605 #else
2606 		top_level = 2;
2607 #endif
2608 	} else {
2609 		pae_support = 0;
2610 		NX_support = 0;
2611 		shift_amt = shift_amt_nopae;
2612 		ptes_per_table = 1024;
2613 		pte_size = 4;
2614 		lpagesize = FOUR_MEG;
2615 		top_level = 1;
2616 	}
2617 
2618 	DBG(PAT_support);
2619 	DBG(pge_support);
2620 	DBG(NX_support);
2621 	DBG(largepage_support);
2622 	DBG(amd64_support);
2623 	DBG(top_level);
2624 	DBG(pte_size);
2625 	DBG(ptes_per_table);
2626 	DBG(lpagesize);
2627 
2628 #if defined(__xpv)
2629 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
2630 #else
2631 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
2632 #endif
2633 
2634 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2635 	/*
2636 	 * For grub, copy kernel bits from the ELF64 file to final place.
2637 	 */
2638 	DBG_MSG("\nAllocating nucleus pages.\n");
2639 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2640 
2641 	if (ktext_phys == 0)
2642 		dboot_panic("failed to allocate aligned kernel memory");
2643 	DBG(load_addr);
2644 	if (dboot_elfload64(load_addr) != 0)
2645 		dboot_panic("failed to parse kernel ELF image, rebooting");
2646 #endif
2647 
2648 	DBG(ktext_phys);
2649 
2650 	/*
2651 	 * Allocate page tables.
2652 	 */
2653 	build_page_tables();
2654 
2655 	/*
2656 	 * return to assembly code to switch to running kernel
2657 	 */
2658 	entry_addr_low = (uint32_t)target_kernel_text;
2659 	DBG(entry_addr_low);
2660 	bi->bi_use_largepage = largepage_support;
2661 	bi->bi_use_pae = pae_support;
2662 	bi->bi_use_pge = pge_support;
2663 	bi->bi_use_nx = NX_support;
2664 
2665 #if defined(__xpv)
2666 
2667 	bi->bi_next_paddr = next_avail_addr - mfn_base;
2668 	DBG(bi->bi_next_paddr);
2669 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2670 	DBG(bi->bi_next_vaddr);
2671 
2672 	/*
2673 	 * unmap unused pages in start area to make them available for DMA
2674 	 */
2675 	while (next_avail_addr < scratch_end) {
2676 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
2677 		    0, UVMF_INVLPG | UVMF_LOCAL);
2678 		next_avail_addr += MMU_PAGESIZE;
2679 	}
2680 
2681 	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2682 	DBG((uintptr_t)HYPERVISOR_shared_info);
2683 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2684 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2685 
2686 #else /* __xpv */
2687 
2688 	bi->bi_next_paddr = next_avail_addr;
2689 	DBG(bi->bi_next_paddr);
2690 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2691 	DBG(bi->bi_next_vaddr);
2692 	bi->bi_mb_version = multiboot_version;
2693 
2694 	switch (multiboot_version) {
2695 	case 1:
2696 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2697 		break;
2698 	case 2:
2699 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2700 		break;
2701 	default:
2702 		dboot_panic("Unknown multiboot version: %d\n",
2703 		    multiboot_version);
2704 		break;
2705 	}
2706 	bi->bi_top_page_table = (uintptr_t)top_page_table;
2707 
2708 #endif /* __xpv */
2709 
2710 	bi->bi_kseg_size = FOUR_MEG;
2711 	DBG(bi->bi_kseg_size);
2712 
2713 #ifndef __xpv
2714 	if (map_debug)
2715 		dump_tables();
2716 #endif
2717 
2718 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2719 
2720 #ifndef __xpv
2721 	/* Update boot info with FB data */
2722 	fb->cursor.origin.x = fb_info.cursor.origin.x;
2723 	fb->cursor.origin.y = fb_info.cursor.origin.y;
2724 	fb->cursor.pos.x = fb_info.cursor.pos.x;
2725 	fb->cursor.pos.y = fb_info.cursor.pos.y;
2726 	fb->cursor.visible = fb_info.cursor.visible;
2727 #endif
2728 }
2729