xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 2dc5cbd37af38054ac6aab88a3ecee22a13ed2eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2013 Joyent, Inc.  All rights reserved.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/sha1.h>
40 #include <util/string.h>
41 #include <util/strtolctype.h>
42 #include <sys/efi.h>
43 
44 /*
45  * Compile time debug knob. We do not have any early mechanism to control it
46  * as the boot is the earliest mechanism we have, and we do not want to have
47  * it being switched on by default.
48  */
49 int dboot_debug = 0;
50 
51 #if defined(__xpv)
52 
53 #include <sys/hypervisor.h>
54 uintptr_t xen_virt_start;
55 pfn_t *mfn_to_pfn_mapping;
56 
57 #else /* !__xpv */
58 
59 extern multiboot_header_t mb_header;
60 extern uint32_t mb2_load_addr;
61 extern int have_cpuid(void);
62 
63 #endif /* !__xpv */
64 
65 #include <sys/inttypes.h>
66 #include <sys/bootinfo.h>
67 #include <sys/mach_mmu.h>
68 #include <sys/boot_console.h>
69 
70 #include "dboot_asm.h"
71 #include "dboot_printf.h"
72 #include "dboot_xboot.h"
73 #include "dboot_elfload.h"
74 
75 #define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
76 
77 /*
78  * This file contains code that runs to transition us from either a multiboot
79  * compliant loader (32 bit non-paging) or a XPV domain loader to
80  * regular kernel execution. Its task is to setup the kernel memory image
81  * and page tables.
82  *
83  * The code executes as:
84  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
85  *	- a 32 bit program for the 32-bit PV hypervisor
86  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
87  *
88  * Under the PV hypervisor, we must create mappings for any memory beyond the
89  * initial start of day allocation (such as the kernel itself).
90  *
91  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
93  */
94 
95 /*
96  * Standard bits used in PTE (page level) and PTP (internal levels)
97  */
98 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
99 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
100 
101 /*
 * These are the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
105  */
106 paddr_t ktext_phys;
107 uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */
108 
109 static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */
110 
111 /*
112  * The stack is setup in assembler before entering startup_kernel()
113  */
114 char stack_space[STACK_SIZE];
115 
116 /*
117  * Used to track physical memory allocation
118  */
119 static paddr_t next_avail_addr = 0;
120 
121 #if defined(__xpv)
122 /*
123  * Additional information needed for hypervisor memory allocation.
124  * Only memory up to scratch_end is mapped by page tables.
125  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
126  * to derive a pfn from a pointer, you subtract mfn_base.
127  */
128 
129 static paddr_t scratch_end = 0;	/* we can't write all of mem here */
130 static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
131 start_info_t *xen_info;
132 
133 #else	/* __xpv */
134 
135 /*
136  * If on the metal, then we have a multiboot loader.
137  */
138 uint32_t mb_magic;			/* magic from boot loader */
139 uint32_t mb_addr;			/* multiboot info package from loader */
140 int multiboot_version;
141 multiboot_info_t *mb_info;
142 multiboot2_info_header_t *mb2_info;
143 multiboot_tag_mmap_t *mb2_mmap_tagp;
144 int num_entries;			/* mmap entry count */
145 boolean_t num_entries_set;		/* is mmap entry count set */
146 uintptr_t load_addr;
147 
148 /* can not be automatic variables because of alignment */
149 static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
150 static efi_guid_t smbios = SMBIOS_TABLE_GUID;
151 static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
152 static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
153 #endif	/* __xpv */
154 
155 /*
156  * This contains information passed to the kernel
157  */
158 struct xboot_info boot_info[2];	/* extra space to fix alignement for amd64 */
159 struct xboot_info *bi;
160 
161 /*
162  * Page table and memory stuff.
163  */
164 static paddr_t max_mem;			/* maximum memory address */
165 
166 /*
167  * Information about processor MMU
168  */
169 int amd64_support = 0;
170 int largepage_support = 0;
171 int pae_support = 0;
172 int pge_support = 0;
173 int NX_support = 0;
174 
175 /*
176  * Low 32 bits of kernel entry address passed back to assembler.
177  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
178  */
179 uint32_t entry_addr_low;
180 
181 /*
182  * Memlists for the kernel. We shouldn't need a lot of these.
183  */
184 #define	MAX_MEMLIST (50)
185 struct boot_memlist memlists[MAX_MEMLIST];
186 uint_t memlists_used = 0;
187 struct boot_memlist pcimemlists[MAX_MEMLIST];
188 uint_t pcimemlists_used = 0;
189 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
190 uint_t rsvdmemlists_used = 0;
191 
192 /*
193  * This should match what's in the bootloader.  It's arbitrary, but GRUB
194  * in particular has limitations on how much space it can use before it
195  * stops working properly.  This should be enough.
196  */
197 struct boot_modules modules[MAX_BOOT_MODULES];
198 uint_t modules_used = 0;
199 
200 #ifdef __xpv
201 /*
202  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
203  * definition in Xen source.
204  */
205 typedef struct {
206 	uint32_t	base_addr_low;
207 	uint32_t	base_addr_high;
208 	uint32_t	length_low;
209 	uint32_t	length_high;
210 	uint32_t	type;
211 } mmap_t;
212 
213 /*
214  * There is 512KB of scratch area after the boot stack page.
215  * We'll use that for everything except the kernel nucleus pages which are too
216  * big to fit there and are allocated last anyway.
217  */
218 #define	MAXMAPS	100
219 static mmap_t map_buffer[MAXMAPS];
220 #else
221 typedef mb_memory_map_t mmap_t;
222 #endif
223 
224 /*
225  * Debugging macros
226  */
227 uint_t prom_debug = 0;
228 uint_t map_debug = 0;
229 
230 static char noname[2] = "-";
231 
232 /*
233  * Either hypervisor-specific or grub-specific code builds the initial
234  * memlists. This code does the sort/merge/link for final use.
235  */
236 static void
237 sort_physinstall(void)
238 {
239 	int i;
240 #if !defined(__xpv)
241 	int j;
242 	struct boot_memlist tmp;
243 
244 	/*
245 	 * Now sort the memlists, in case they weren't in order.
246 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
247 	 */
248 	DBG_MSG("Sorting phys-installed list\n");
249 	for (j = memlists_used - 1; j > 0; --j) {
250 		for (i = 0; i < j; ++i) {
251 			if (memlists[i].addr < memlists[i + 1].addr)
252 				continue;
253 			tmp = memlists[i];
254 			memlists[i] = memlists[i + 1];
255 			memlists[i + 1] = tmp;
256 		}
257 	}
258 
259 	/*
260 	 * Merge any memlists that don't have holes between them.
261 	 */
262 	for (i = 0; i <= memlists_used - 1; ++i) {
263 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
264 			continue;
265 
266 		if (prom_debug)
267 			dboot_printf(
268 			    "merging mem segs %" PRIx64 "...%" PRIx64
269 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
270 			    memlists[i].addr,
271 			    memlists[i].addr + memlists[i].size,
272 			    memlists[i + 1].addr,
273 			    memlists[i + 1].addr + memlists[i + 1].size);
274 
275 		memlists[i].size += memlists[i + 1].size;
276 		for (j = i + 1; j < memlists_used - 1; ++j)
277 			memlists[j] = memlists[j + 1];
278 		--memlists_used;
279 		DBG(memlists_used);
280 		--i;	/* after merging we need to reexamine, so do this */
281 	}
282 #endif	/* __xpv */
283 
284 	if (prom_debug) {
285 		dboot_printf("\nFinal memlists:\n");
286 		for (i = 0; i < memlists_used; ++i) {
287 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
288 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
289 		}
290 	}
291 
292 	/*
293 	 * link together the memlists with native size pointers
294 	 */
295 	memlists[0].next = 0;
296 	memlists[0].prev = 0;
297 	for (i = 1; i < memlists_used; ++i) {
298 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
299 		memlists[i].next = 0;
300 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
301 	}
302 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
303 	DBG(bi->bi_phys_install);
304 }
305 
306 /*
307  * build bios reserved memlists
308  */
309 static void
310 build_rsvdmemlists(void)
311 {
312 	int i;
313 
314 	rsvdmemlists[0].next = 0;
315 	rsvdmemlists[0].prev = 0;
316 	for (i = 1; i < rsvdmemlists_used; ++i) {
317 		rsvdmemlists[i].prev =
318 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
319 		rsvdmemlists[i].next = 0;
320 		rsvdmemlists[i - 1].next =
321 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
322 	}
323 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
324 	DBG(bi->bi_rsvdmem);
325 }
326 
327 #if defined(__xpv)
328 
329 /*
330  * halt on the hypervisor after a delay to drain console output
331  */
332 void
333 dboot_halt(void)
334 {
335 	uint_t i = 10000;
336 
337 	while (--i)
338 		(void) HYPERVISOR_yield();
339 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
340 }
341 
342 /*
343  * From a machine address, find the corresponding pseudo-physical address.
344  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
345  * Machine addresses are the real underlying hardware addresses.
346  * These are needed for page table entries. Note that this routine is
347  * poorly protected. A bad value of "ma" will cause a page fault.
348  */
349 paddr_t
350 ma_to_pa(maddr_t ma)
351 {
352 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
353 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
354 	paddr_t pa;
355 
356 	if (pfn >= xen_info->nr_pages)
357 		return (-(paddr_t)1);
358 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
359 #ifdef DEBUG
360 	if (ma != pa_to_ma(pa))
361 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
362 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
363 #endif
364 	return (pa);
365 }
366 
367 /*
368  * From a pseudo-physical address, find the corresponding machine address.
369  */
370 maddr_t
371 pa_to_ma(paddr_t pa)
372 {
373 	pfn_t pfn;
374 	ulong_t mfn;
375 
376 	pfn = mmu_btop(pa - mfn_base);
377 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
378 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
379 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
380 #ifdef DEBUG
381 	if (mfn_to_pfn_mapping[mfn] != pfn)
382 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
383 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
384 #endif
385 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
386 }
387 
388 #endif	/* __xpv */
389 
390 x86pte_t
391 get_pteval(paddr_t table, uint_t index)
392 {
393 	if (pae_support)
394 		return (((x86pte_t *)(uintptr_t)table)[index]);
395 	return (((x86pte32_t *)(uintptr_t)table)[index]);
396 }
397 
398 /*ARGSUSED*/
399 void
400 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
401 {
402 #ifdef __xpv
403 	mmu_update_t t;
404 	maddr_t mtable = pa_to_ma(table);
405 	int retcnt;
406 
407 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
408 	t.val = pteval;
409 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
410 		dboot_panic("HYPERVISOR_mmu_update() failed");
411 #else /* __xpv */
412 	uintptr_t tab_addr = (uintptr_t)table;
413 
414 	if (pae_support)
415 		((x86pte_t *)tab_addr)[index] = pteval;
416 	else
417 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
418 	if (level == top_level && level == 2)
419 		reload_cr3();
420 #endif /* __xpv */
421 }
422 
423 paddr_t
424 make_ptable(x86pte_t *pteval, uint_t level)
425 {
426 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
427 
428 	if (level == top_level && level == 2)
429 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
430 	else
431 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
432 
433 #ifdef __xpv
434 	/* Remove write permission to the new page table. */
435 	if (HYPERVISOR_update_va_mapping(new_table,
436 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
437 		dboot_panic("HYP_update_va_mapping error");
438 #endif
439 
440 	if (map_debug)
441 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
442 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
443 	return (new_table);
444 }
445 
446 x86pte_t *
447 map_pte(paddr_t table, uint_t index)
448 {
449 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
450 }
451 
/*
 * dump out the contents of page tables...
 *
 * Starting at top_page_table / top_level, every non-zero entry is printed
 * together with the virtual address it covers and the physical address
 * obtained from ma_to_pa().  Lower level tables are visited depth first.
 * The walk is iterative: save_table[] / save_index[] act as an explicit
 * stack indexed by level, and the "recursion" label sits at the very end of
 * the for body so that a jump to it is followed by the loop's ++index.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	/* tabs + l yields (top_level - l) tabs, so deeper levels indent more */
	char *tabs = tablist + 3 - top_level;
	/*
	 * NOTE(review): pa/pa1 are uint_t, so a value from ma_to_pa() that is
	 * wider than uint_t is truncated before it is printed or compared --
	 * confirm whether that is intended.
	 */
	uint_t pa, pa1;
	/*
	 * NOTE(review): this define is not #undef'd in the visible part of the
	 * file, so on !__xpv builds it presumably stays in effect for the code
	 * that follows this function.
	 */
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		/* amount of address space covered by one entry at level l */
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		/*
		 * Descend when the entry refers to a lower level table: any
		 * level above 1, or level 1 without PT_PAGESIZE set.  The
		 * current position is saved, and index is set to -1 so that
		 * the ++index after the jump starts the child table at 0.
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		/* count how many following entries continue the same run */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		/*
		 * For a run longer than two entries print "..." and skip
		 * ahead so that the last entry of the run is the next one
		 * dumped.
		 */
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/*
	 * Finished a lower level table: pop back to the parent and jump to
	 * the end of the loop body so that ++index moves on to the entry
	 * after the one we descended from.
	 */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
538 
539 /*
540  * Add a mapping for the machine page at the given virtual address.
541  */
542 static void
543 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
544 {
545 	x86pte_t *ptep;
546 	x86pte_t pteval;
547 
548 	pteval = ma | pte_bits;
549 	if (level > 0)
550 		pteval |= PT_PAGESIZE;
551 	if (va >= target_kernel_text && pge_support)
552 		pteval |= PT_GLOBAL;
553 
554 	if (map_debug && ma != va)
555 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
556 		    " pte=0x%" PRIx64 " l=%d\n",
557 		    (uint64_t)ma, (uint64_t)va, pteval, level);
558 
559 #if defined(__xpv)
560 	/*
561 	 * see if we can avoid find_pte() on the hypervisor
562 	 */
563 	if (HYPERVISOR_update_va_mapping(va, pteval,
564 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
565 		return;
566 #endif
567 
568 	/*
569 	 * Find the pte that will map this address. This creates any
570 	 * missing intermediate level page tables
571 	 */
572 	ptep = find_pte(va, NULL, level, 0);
573 
574 	/*
575 	 * When paravirtualized, we must use hypervisor calls to modify the
576 	 * PTE, since paging is active. On real hardware we just write to
577 	 * the pagetables which aren't in use yet.
578 	 */
579 #if defined(__xpv)
580 	ptep = ptep;	/* shut lint up */
581 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
582 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
583 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
584 		    (uint64_t)va, level, (uint64_t)ma, pteval);
585 #else
586 	if (va < 1024 * 1024)
587 		pteval |= PT_NOCACHE;		/* for video RAM */
588 	if (pae_support)
589 		*ptep = pteval;
590 	else
591 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
592 #endif
593 }
594 
595 /*
596  * Add a mapping for the physical page at the given virtual address.
597  */
598 static void
599 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
600 {
601 	map_ma_at_va(pa_to_ma(pa), va, level);
602 }
603 
604 /*
605  * This is called to remove start..end from the
606  * possible range of PCI addresses.
607  */
608 const uint64_t pci_lo_limit = 0x00100000ul;
609 const uint64_t pci_hi_limit = 0xfff00000ul;
610 static void
611 exclude_from_pci(uint64_t start, uint64_t end)
612 {
613 	int i;
614 	int j;
615 	struct boot_memlist *ml;
616 
617 	for (i = 0; i < pcimemlists_used; ++i) {
618 		ml = &pcimemlists[i];
619 
620 		/* delete the entire range? */
621 		if (start <= ml->addr && ml->addr + ml->size <= end) {
622 			--pcimemlists_used;
623 			for (j = i; j < pcimemlists_used; ++j)
624 				pcimemlists[j] = pcimemlists[j + 1];
625 			--i;	/* to revisit the new one at this index */
626 		}
627 
628 		/* split a range? */
629 		else if (ml->addr < start && end < ml->addr + ml->size) {
630 
631 			++pcimemlists_used;
632 			if (pcimemlists_used > MAX_MEMLIST)
633 				dboot_panic("too many pcimemlists");
634 
635 			for (j = pcimemlists_used - 1; j > i; --j)
636 				pcimemlists[j] = pcimemlists[j - 1];
637 			ml->size = start - ml->addr;
638 
639 			++ml;
640 			ml->size = (ml->addr + ml->size) - end;
641 			ml->addr = end;
642 			++i;	/* skip on to next one */
643 		}
644 
645 		/* cut memory off the start? */
646 		else if (ml->addr < end && end < ml->addr + ml->size) {
647 			ml->size -= end - ml->addr;
648 			ml->addr = end;
649 		}
650 
651 		/* cut memory off the end? */
652 		else if (ml->addr <= start && start < ml->addr + ml->size) {
653 			ml->size = start - ml->addr;
654 		}
655 	}
656 }
657 
658 /*
659  * During memory allocation, find the highest address not used yet.
660  */
661 static void
662 check_higher(paddr_t a)
663 {
664 	if (a < next_avail_addr)
665 		return;
666 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
667 	DBG(next_avail_addr);
668 }
669 
670 static int
671 dboot_loader_mmap_entries(void)
672 {
673 #if !defined(__xpv)
674 	if (num_entries_set == B_TRUE)
675 		return (num_entries);
676 
677 	switch (multiboot_version) {
678 	case 1:
679 		DBG(mb_info->flags);
680 		if (mb_info->flags & 0x40) {
681 			mb_memory_map_t *mmap;
682 
683 			DBG(mb_info->mmap_addr);
684 			DBG(mb_info->mmap_length);
685 			check_higher(mb_info->mmap_addr + mb_info->mmap_length);
686 
687 			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
688 			    (uint32_t)mmap < mb_info->mmap_addr +
689 			    mb_info->mmap_length;
690 			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
691 			    mmap->size + sizeof (mmap->size)))
692 				++num_entries;
693 
694 			num_entries_set = B_TRUE;
695 		}
696 		break;
697 	case 2:
698 		num_entries_set = B_TRUE;
699 		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
700 		    mb2_mmap_tagp);
701 		break;
702 	default:
703 		dboot_panic("Unknown multiboot version: %d\n",
704 		    multiboot_version);
705 		break;
706 	}
707 	return (num_entries);
708 #else
709 	return (MAXMAPS);
710 #endif
711 }
712 
713 static uint32_t
714 dboot_loader_mmap_get_type(int index)
715 {
716 #if !defined(__xpv)
717 	mb_memory_map_t *mp, *mpend;
718 	int i;
719 
720 	switch (multiboot_version) {
721 	case 1:
722 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
723 		mpend = (mb_memory_map_t *)
724 		    (mb_info->mmap_addr + mb_info->mmap_length);
725 
726 		for (i = 0; mp < mpend && i != index; i++)
727 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
728 			    sizeof (mp->size));
729 		if (mp >= mpend) {
730 			dboot_panic("dboot_loader_mmap_get_type(): index "
731 			    "out of bounds: %d\n", index);
732 		}
733 		return (mp->type);
734 
735 	case 2:
736 		return (dboot_multiboot2_mmap_get_type(mb2_info,
737 		    mb2_mmap_tagp, index));
738 
739 	default:
740 		dboot_panic("Unknown multiboot version: %d\n",
741 		    multiboot_version);
742 		break;
743 	}
744 	return (0);
745 #else
746 	return (map_buffer[index].type);
747 #endif
748 }
749 
750 static uint64_t
751 dboot_loader_mmap_get_base(int index)
752 {
753 #if !defined(__xpv)
754 	mb_memory_map_t *mp, *mpend;
755 	int i;
756 
757 	switch (multiboot_version) {
758 	case 1:
759 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
760 		mpend = (mb_memory_map_t *)
761 		    (mb_info->mmap_addr + mb_info->mmap_length);
762 
763 		for (i = 0; mp < mpend && i != index; i++)
764 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
765 			    sizeof (mp->size));
766 		if (mp >= mpend) {
767 			dboot_panic("dboot_loader_mmap_get_base(): index "
768 			    "out of bounds: %d\n", index);
769 		}
770 		return (((uint64_t)mp->base_addr_high << 32) +
771 		    (uint64_t)mp->base_addr_low);
772 
773 	case 2:
774 		return (dboot_multiboot2_mmap_get_base(mb2_info,
775 		    mb2_mmap_tagp, index));
776 
777 	default:
778 		dboot_panic("Unknown multiboot version: %d\n",
779 		    multiboot_version);
780 		break;
781 	}
782 	return (0);
783 #else
784 	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
785 	    (uint64_t)map_buffer[index].base_addr_low);
786 #endif
787 }
788 
789 static uint64_t
790 dboot_loader_mmap_get_length(int index)
791 {
792 #if !defined(__xpv)
793 	mb_memory_map_t *mp, *mpend;
794 	int i;
795 
796 	switch (multiboot_version) {
797 	case 1:
798 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
799 		mpend = (mb_memory_map_t *)
800 		    (mb_info->mmap_addr + mb_info->mmap_length);
801 
802 		for (i = 0; mp < mpend && i != index; i++)
803 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
804 			    sizeof (mp->size));
805 		if (mp >= mpend) {
806 			dboot_panic("dboot_loader_mmap_get_length(): index "
807 			    "out of bounds: %d\n", index);
808 		}
809 		return (((uint64_t)mp->length_high << 32) +
810 		    (uint64_t)mp->length_low);
811 
812 	case 2:
813 		return (dboot_multiboot2_mmap_get_length(mb2_info,
814 		    mb2_mmap_tagp, index));
815 
816 	default:
817 		dboot_panic("Unknown multiboot version: %d\n",
818 		    multiboot_version);
819 		break;
820 	}
821 	return (0);
822 #else
823 	return (((uint64_t)map_buffer[index].length_high << 32) +
824 	    (uint64_t)map_buffer[index].length_low);
825 #endif
826 }
827 
828 static void
829 build_pcimemlists(void)
830 {
831 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
832 	uint64_t start;
833 	uint64_t end;
834 	int i, num;
835 
836 	/*
837 	 * initialize
838 	 */
839 	pcimemlists[0].addr = pci_lo_limit;
840 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
841 	pcimemlists_used = 1;
842 
843 	num = dboot_loader_mmap_entries();
844 	/*
845 	 * Fill in PCI memlists.
846 	 */
847 	for (i = 0; i < num; ++i) {
848 		start = dboot_loader_mmap_get_base(i);
849 		end = start + dboot_loader_mmap_get_length(i);
850 
851 		if (prom_debug)
852 			dboot_printf("\ttype: %d %" PRIx64 "..%"
853 			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
854 			    start, end);
855 
856 		/*
857 		 * page align start and end
858 		 */
859 		start = (start + page_offset) & ~page_offset;
860 		end &= ~page_offset;
861 		if (end <= start)
862 			continue;
863 
864 		exclude_from_pci(start, end);
865 	}
866 
867 	/*
868 	 * Finish off the pcimemlist
869 	 */
870 	if (prom_debug) {
871 		for (i = 0; i < pcimemlists_used; ++i) {
872 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
873 			    PRIx64 "\n", pcimemlists[i].addr,
874 			    pcimemlists[i].addr + pcimemlists[i].size);
875 		}
876 	}
877 	pcimemlists[0].next = 0;
878 	pcimemlists[0].prev = 0;
879 	for (i = 1; i < pcimemlists_used; ++i) {
880 		pcimemlists[i].prev =
881 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
882 		pcimemlists[i].next = 0;
883 		pcimemlists[i - 1].next =
884 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
885 	}
886 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
887 	DBG(bi->bi_pcimem);
888 }
889 
890 #if defined(__xpv)
891 /*
892  * Initialize memory allocator stuff from hypervisor-supplied start info.
893  */
894 static void
895 init_mem_alloc(void)
896 {
897 	int	local;	/* variables needed to find start region */
898 	paddr_t	scratch_start;
899 	xen_memory_map_t map;
900 
901 	DBG_MSG("Entered init_mem_alloc()\n");
902 
903 	/*
904 	 * Free memory follows the stack. There's at least 512KB of scratch
905 	 * space, rounded up to at least 2Mb alignment.  That should be enough
906 	 * for the page tables we'll need to build.  The nucleus memory is
907 	 * allocated last and will be outside the addressible range.  We'll
908 	 * switch to new page tables before we unpack the kernel
909 	 */
910 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
911 	DBG(scratch_start);
912 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
913 	DBG(scratch_end);
914 
915 	/*
916 	 * For paranoia, leave some space between hypervisor data and ours.
917 	 * Use 500 instead of 512.
918 	 */
919 	next_avail_addr = scratch_end - 500 * 1024;
920 	DBG(next_avail_addr);
921 
922 	/*
923 	 * The domain builder gives us at most 1 module
924 	 */
925 	DBG(xen_info->mod_len);
926 	if (xen_info->mod_len > 0) {
927 		DBG(xen_info->mod_start);
928 		modules[0].bm_addr =
929 		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
930 		modules[0].bm_size = xen_info->mod_len;
931 		bi->bi_module_cnt = 1;
932 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
933 	} else {
934 		bi->bi_module_cnt = 0;
935 		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
936 	}
937 	DBG(bi->bi_module_cnt);
938 	DBG(bi->bi_modules);
939 
940 	DBG(xen_info->mfn_list);
941 	DBG(xen_info->nr_pages);
942 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
943 	DBG(max_mem);
944 
945 	/*
946 	 * Using pseudo-physical addresses, so only 1 memlist element
947 	 */
948 	memlists[0].addr = 0;
949 	DBG(memlists[0].addr);
950 	memlists[0].size = max_mem;
951 	DBG(memlists[0].size);
952 	memlists_used = 1;
953 	DBG(memlists_used);
954 
955 	/*
956 	 * finish building physinstall list
957 	 */
958 	sort_physinstall();
959 
960 	/*
961 	 * build bios reserved memlists
962 	 */
963 	build_rsvdmemlists();
964 
965 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
966 		/*
967 		 * build PCI Memory list
968 		 */
969 		map.nr_entries = MAXMAPS;
970 		/*LINTED: constant in conditional context*/
971 		set_xen_guest_handle(map.buffer, map_buffer);
972 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
973 			dboot_panic("getting XENMEM_machine_memory_map failed");
974 		build_pcimemlists();
975 	}
976 }
977 
978 #else	/* !__xpv */
979 
980 static void
981 dboot_multiboot1_xboot_consinfo(void)
982 {
983 	bi->bi_framebuffer = NULL;
984 }
985 
986 static void
987 dboot_multiboot2_xboot_consinfo(void)
988 {
989 	multiboot_tag_framebuffer_t *fb;
990 	fb = dboot_multiboot2_find_tag(mb2_info,
991 	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
992 	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
993 }
994 
995 static int
996 dboot_multiboot_modcount(void)
997 {
998 	switch (multiboot_version) {
999 	case 1:
1000 		return (mb_info->mods_count);
1001 
1002 	case 2:
1003 		return (dboot_multiboot2_modcount(mb2_info));
1004 
1005 	default:
1006 		dboot_panic("Unknown multiboot version: %d\n",
1007 		    multiboot_version);
1008 		break;
1009 	}
1010 	return (0);
1011 }
1012 
1013 static uint32_t
1014 dboot_multiboot_modstart(int index)
1015 {
1016 	switch (multiboot_version) {
1017 	case 1:
1018 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1019 
1020 	case 2:
1021 		return (dboot_multiboot2_modstart(mb2_info, index));
1022 
1023 	default:
1024 		dboot_panic("Unknown multiboot version: %d\n",
1025 		    multiboot_version);
1026 		break;
1027 	}
1028 	return (0);
1029 }
1030 
1031 static uint32_t
1032 dboot_multiboot_modend(int index)
1033 {
1034 	switch (multiboot_version) {
1035 	case 1:
1036 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1037 
1038 	case 2:
1039 		return (dboot_multiboot2_modend(mb2_info, index));
1040 
1041 	default:
1042 		dboot_panic("Unknown multiboot version: %d\n",
1043 		    multiboot_version);
1044 		break;
1045 	}
1046 	return (0);
1047 }
1048 
1049 static char *
1050 dboot_multiboot_modcmdline(int index)
1051 {
1052 	switch (multiboot_version) {
1053 	case 1:
1054 		return ((char *)((mb_module_t *)
1055 		    mb_info->mods_addr)[index].mod_name);
1056 
1057 	case 2:
1058 		return (dboot_multiboot2_modcmdline(mb2_info, index));
1059 
1060 	default:
1061 		dboot_panic("Unknown multiboot version: %d\n",
1062 		    multiboot_version);
1063 		break;
1064 	}
1065 	return (0);
1066 }
1067 
1068 /*
1069  * Find the environment module for console setup.
1070  * Since we need the console to print early boot messages, the console is set up
1071  * before anything else and therefore we need to pick up the environment module
1072  * early too.
1073  *
1074  * Note, we just will search for and if found, will pass the env
1075  * module to console setup, the proper module list processing will happen later.
1076  */
1077 static void
1078 dboot_find_env(void)
1079 {
1080 	int i, modcount;
1081 	uint32_t mod_start, mod_end;
1082 	char *cmdline;
1083 
1084 	modcount = dboot_multiboot_modcount();
1085 
1086 	for (i = 0; i < modcount; ++i) {
1087 		cmdline = dboot_multiboot_modcmdline(i);
1088 		if (cmdline == NULL)
1089 			continue;
1090 
1091 		if (strstr(cmdline, "type=environment") == NULL)
1092 			continue;
1093 
1094 		mod_start = dboot_multiboot_modstart(i);
1095 		mod_end = dboot_multiboot_modend(i);
1096 		modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1097 		modules[0].bm_size = mod_end - mod_start;
1098 		modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
1099 		modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1100 		modules[0].bm_type = BMT_ENV;
1101 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1102 		bi->bi_module_cnt = 1;
1103 		return;
1104 	}
1105 }
1106 
1107 static boolean_t
1108 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1109 {
1110 	boolean_t rv = B_FALSE;
1111 
1112 	switch (multiboot_version) {
1113 	case 1:
1114 		if (mb_info->flags & 0x01) {
1115 			*lower = mb_info->mem_lower;
1116 			*upper = mb_info->mem_upper;
1117 			rv = B_TRUE;
1118 		}
1119 		break;
1120 
1121 	case 2:
1122 		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1123 
1124 	default:
1125 		dboot_panic("Unknown multiboot version: %d\n",
1126 		    multiboot_version);
1127 		break;
1128 	}
1129 	return (rv);
1130 }
1131 
/*
 * Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
 * numeric value 0..15.  Any other character is rejected via dboot_panic();
 * 0 is returned should the panic ever return.
 *
 * Each range is checked on both ends so that characters such as 'g' or ':'
 * cannot slip through and yield a value that does not fit in a nibble.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
1146 
1147 static void
1148 digest_a2h(const char *ascii, uint8_t *digest)
1149 {
1150 	unsigned int i;
1151 
1152 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1153 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1154 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1155 	}
1156 }
1157 
1158 /*
1159  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1160  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1161  * match, return 0, otherwise -1.  This works only for images smaller than
1162  * 4 GB, which should not be a problem.
1163  */
1164 static int
1165 check_image_hash(uint_t midx)
1166 {
1167 	const char *ascii;
1168 	const void *image;
1169 	size_t len;
1170 	SHA1_CTX ctx;
1171 	uint8_t digest[SHA1_DIGEST_LENGTH];
1172 	uint8_t baseline[SHA1_DIGEST_LENGTH];
1173 	unsigned int i;
1174 
1175 	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1176 	image = (const void *)(uintptr_t)modules[midx].bm_addr;
1177 	len = (size_t)modules[midx].bm_size;
1178 
1179 	digest_a2h(ascii, baseline);
1180 
1181 	SHA1Init(&ctx);
1182 	SHA1Update(&ctx, image, len);
1183 	SHA1Final(digest, &ctx);
1184 
1185 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1186 		if (digest[i] != baseline[i])
1187 			return (-1);
1188 	}
1189 
1190 	return (0);
1191 }
1192 
1193 static const char *
1194 type_to_str(boot_module_type_t type)
1195 {
1196 	switch (type) {
1197 	case BMT_ROOTFS:
1198 		return ("rootfs");
1199 	case BMT_FILE:
1200 		return ("file");
1201 	case BMT_HASH:
1202 		return ("hash");
1203 	case BMT_ENV:
1204 		return ("environment");
1205 	default:
1206 		return ("unknown");
1207 	}
1208 }
1209 
1210 static void
1211 check_images(void)
1212 {
1213 	uint_t i;
1214 	char displayhash[SHA1_ASCII_LENGTH + 1];
1215 
1216 	for (i = 0; i < modules_used; i++) {
1217 		if (prom_debug) {
1218 			dboot_printf("module #%d: name %s type %s "
1219 			    "addr %lx size %lx\n",
1220 			    i, (char *)(uintptr_t)modules[i].bm_name,
1221 			    type_to_str(modules[i].bm_type),
1222 			    (ulong_t)modules[i].bm_addr,
1223 			    (ulong_t)modules[i].bm_size);
1224 		}
1225 
1226 		if (modules[i].bm_type == BMT_HASH ||
1227 		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1228 			DBG_MSG("module has no hash; skipping check\n");
1229 			continue;
1230 		}
1231 		(void) memcpy(displayhash,
1232 		    (void *)(uintptr_t)modules[i].bm_hash,
1233 		    SHA1_ASCII_LENGTH);
1234 		displayhash[SHA1_ASCII_LENGTH] = '\0';
1235 		if (prom_debug) {
1236 			dboot_printf("checking expected hash [%s]: ",
1237 			    displayhash);
1238 		}
1239 
1240 		if (check_image_hash(i) != 0)
1241 			dboot_panic("hash mismatch!\n");
1242 		else
1243 			DBG_MSG("OK\n");
1244 	}
1245 }
1246 
1247 /*
1248  * Determine the module's starting address, size, name, and type, and fill the
1249  * boot_modules structure.  This structure is used by the bop code, except for
1250  * hashes which are checked prior to transferring control to the kernel.
1251  */
1252 static void
1253 process_module(int midx)
1254 {
1255 	uint32_t mod_start = dboot_multiboot_modstart(midx);
1256 	uint32_t mod_end = dboot_multiboot_modend(midx);
1257 	char *cmdline = dboot_multiboot_modcmdline(midx);
1258 	char *p, *q;
1259 
1260 	check_higher(mod_end);
1261 	if (prom_debug) {
1262 		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1263 		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1264 	}
1265 
1266 	if (mod_start > mod_end) {
1267 		dboot_panic("module #%d: module start address 0x%lx greater "
1268 		    "than end address 0x%lx", midx,
1269 		    (ulong_t)mod_start, (ulong_t)mod_end);
1270 	}
1271 
1272 	/*
1273 	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1274 	 * the address of the last valid byte in a module plus 1 as mod_end.
1275 	 * This is of course a bug; the multiboot specification simply states
1276 	 * that mod_start and mod_end "contain the start and end addresses of
1277 	 * the boot module itself" which is pretty obviously not what GRUB is
1278 	 * doing.  However, fixing it requires that not only this code be
1279 	 * changed but also that other code consuming this value and values
1280 	 * derived from it be fixed, and that the kernel and GRUB must either
1281 	 * both have the bug or neither.  While there are a lot of combinations
1282 	 * that will work, there are also some that won't, so for simplicity
1283 	 * we'll just cope with the bug.  That means we won't actually hash the
1284 	 * byte at mod_end, and we will expect that mod_end for the hash file
1285 	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1286 	 * hash plus a newline for each module).  We set bm_size to the true
1287 	 * correct number of bytes in each module, achieving exactly this.
1288 	 */
1289 
1290 	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1291 	modules[midx].bm_size = mod_end - mod_start;
1292 	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1293 	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1294 	modules[midx].bm_type = BMT_FILE;
1295 
1296 	if (cmdline == NULL) {
1297 		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1298 		return;
1299 	}
1300 
1301 	p = cmdline;
1302 	modules[midx].bm_name =
1303 	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1304 
1305 	while (p != NULL) {
1306 		q = strsep(&p, " \t\f\n\r");
1307 		if (strncmp(q, "name=", 5) == 0) {
1308 			if (q[5] != '\0' && !isspace(q[5])) {
1309 				modules[midx].bm_name =
1310 				    (native_ptr_t)(uintptr_t)(q + 5);
1311 			}
1312 			continue;
1313 		}
1314 
1315 		if (strncmp(q, "type=", 5) == 0) {
1316 			if (q[5] == '\0' || isspace(q[5]))
1317 				continue;
1318 			q += 5;
1319 			if (strcmp(q, "rootfs") == 0) {
1320 				modules[midx].bm_type = BMT_ROOTFS;
1321 			} else if (strcmp(q, "hash") == 0) {
1322 				modules[midx].bm_type = BMT_HASH;
1323 			} else if (strcmp(q, "environment") == 0) {
1324 				modules[midx].bm_type = BMT_ENV;
1325 			} else if (strcmp(q, "file") != 0) {
1326 				dboot_printf("\tmodule #%d: unknown module "
1327 				    "type '%s'; defaulting to 'file'\n",
1328 				    midx, q);
1329 			}
1330 			continue;
1331 		}
1332 
1333 		if (strncmp(q, "hash=", 5) == 0) {
1334 			if (q[5] != '\0' && !isspace(q[5])) {
1335 				modules[midx].bm_hash =
1336 				    (native_ptr_t)(uintptr_t)(q + 5);
1337 			}
1338 			continue;
1339 		}
1340 
1341 		dboot_printf("ignoring unknown option '%s'\n", q);
1342 	}
1343 }
1344 
1345 /*
1346  * Backward compatibility: if there are exactly one or two modules, both
1347  * of type 'file' and neither with an embedded hash value, we have been
1348  * given the legacy style modules.  In this case we need to treat the first
1349  * module as a rootfs and the second as a hash referencing that module.
1350  * Otherwise, even if the configuration is invalid, we assume that the
1351  * operator knows what he's doing or at least isn't being bitten by this
1352  * interface change.
1353  */
1354 static void
1355 fixup_modules(void)
1356 {
1357 	if (modules_used == 0 || modules_used > 2)
1358 		return;
1359 
1360 	if (modules[0].bm_type != BMT_FILE ||
1361 	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1362 		return;
1363 	}
1364 
1365 	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1366 	    modules_used > 1 &&
1367 	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1368 		return;
1369 	}
1370 
1371 	modules[0].bm_type = BMT_ROOTFS;
1372 	if (modules_used > 1) {
1373 		modules[1].bm_type = BMT_HASH;
1374 		modules[1].bm_name = modules[0].bm_name;
1375 	}
1376 }
1377 
1378 /*
1379  * For modules that do not have assigned hashes but have a separate hash module,
1380  * find the assigned hash module and set the primary module's bm_hash to point
1381  * to the hash data from that module.  We will then ignore modules of type
1382  * BMT_HASH from this point forward.
1383  */
1384 static void
1385 assign_module_hashes(void)
1386 {
1387 	uint_t i, j;
1388 
1389 	for (i = 0; i < modules_used; i++) {
1390 		if (modules[i].bm_type == BMT_HASH ||
1391 		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1392 			continue;
1393 		}
1394 
1395 		for (j = 0; j < modules_used; j++) {
1396 			if (modules[j].bm_type != BMT_HASH ||
1397 			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1398 			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1399 				continue;
1400 			}
1401 
1402 			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1403 				dboot_printf("Short hash module of length "
1404 				    "0x%lx bytes; ignoring\n",
1405 				    (ulong_t)modules[j].bm_size);
1406 			} else {
1407 				modules[i].bm_hash = modules[j].bm_addr;
1408 			}
1409 			break;
1410 		}
1411 	}
1412 }
1413 
1414 /*
1415  * Walk through the module information finding the last used address.
1416  * The first available address will become the top level page table.
1417  */
1418 static void
1419 dboot_process_modules(void)
1420 {
1421 	int i, modcount;
1422 	extern char _end[];
1423 
1424 	DBG_MSG("\nFinding Modules\n");
1425 	modcount = dboot_multiboot_modcount();
1426 	if (modcount > MAX_BOOT_MODULES) {
1427 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1428 		    modcount, MAX_BOOT_MODULES);
1429 	}
1430 	/*
1431 	 * search the modules to find the last used address
1432 	 * we'll build the module list while we're walking through here
1433 	 */
1434 	check_higher((paddr_t)(uintptr_t)&_end);
1435 	for (i = 0; i < modcount; ++i) {
1436 		process_module(i);
1437 		modules_used++;
1438 	}
1439 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1440 	DBG(bi->bi_modules);
1441 	bi->bi_module_cnt = modcount;
1442 	DBG(bi->bi_module_cnt);
1443 
1444 	fixup_modules();
1445 	assign_module_hashes();
1446 	check_images();
1447 }
1448 
1449 /*
1450  * We then build the phys_install memlist from the multiboot information.
1451  */
1452 static void
1453 dboot_process_mmap(void)
1454 {
1455 	uint64_t start;
1456 	uint64_t end;
1457 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
1458 	uint32_t lower, upper;
1459 	int i, mmap_entries;
1460 
1461 	/*
1462 	 * Walk through the memory map from multiboot and build our memlist
1463 	 * structures. Note these will have native format pointers.
1464 	 */
1465 	DBG_MSG("\nFinding Memory Map\n");
1466 	num_entries = 0;
1467 	num_entries_set = B_FALSE;
1468 	max_mem = 0;
1469 	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1470 		for (i = 0; i < mmap_entries; i++) {
1471 			uint32_t type = dboot_loader_mmap_get_type(i);
1472 			start = dboot_loader_mmap_get_base(i);
1473 			end = start + dboot_loader_mmap_get_length(i);
1474 
1475 			if (prom_debug)
1476 				dboot_printf("\ttype: %d %" PRIx64 "..%"
1477 				    PRIx64 "\n", type, start, end);
1478 
1479 			/*
1480 			 * page align start and end
1481 			 */
1482 			start = (start + page_offset) & ~page_offset;
1483 			end &= ~page_offset;
1484 			if (end <= start)
1485 				continue;
1486 
1487 			/*
1488 			 * only type 1 is usable RAM
1489 			 */
1490 			switch (type) {
1491 			case 1:
1492 				if (end > max_mem)
1493 					max_mem = end;
1494 				memlists[memlists_used].addr = start;
1495 				memlists[memlists_used].size = end - start;
1496 				++memlists_used;
1497 				if (memlists_used > MAX_MEMLIST)
1498 					dboot_panic("too many memlists");
1499 				break;
1500 			case 2:
1501 				rsvdmemlists[rsvdmemlists_used].addr = start;
1502 				rsvdmemlists[rsvdmemlists_used].size =
1503 				    end - start;
1504 				++rsvdmemlists_used;
1505 				if (rsvdmemlists_used > MAX_MEMLIST)
1506 					dboot_panic("too many rsvdmemlists");
1507 				break;
1508 			default:
1509 				continue;
1510 			}
1511 		}
1512 		build_pcimemlists();
1513 	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1514 		DBG(lower);
1515 		memlists[memlists_used].addr = 0;
1516 		memlists[memlists_used].size = lower * 1024;
1517 		++memlists_used;
1518 		DBG(upper);
1519 		memlists[memlists_used].addr = 1024 * 1024;
1520 		memlists[memlists_used].size = upper * 1024;
1521 		++memlists_used;
1522 
1523 		/*
1524 		 * Old platform - assume I/O space at the end of memory.
1525 		 */
1526 		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1527 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1528 		pcimemlists[0].next = 0;
1529 		pcimemlists[0].prev = 0;
1530 		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1531 		DBG(bi->bi_pcimem);
1532 	} else {
1533 		dboot_panic("No memory info from boot loader!!!");
1534 	}
1535 
1536 	/*
1537 	 * finish processing the physinstall list
1538 	 */
1539 	sort_physinstall();
1540 
1541 	/*
1542 	 * build bios reserved mem lists
1543 	 */
1544 	build_rsvdmemlists();
1545 }
1546 
1547 /*
1548  * The highest address is used as the starting point for dboot's simple
1549  * memory allocator.
1550  *
1551  * Finding the highest address in case of Multiboot 1 protocol is
1552  * quite painful in the sense that some information provided by
1553  * the multiboot info structure points to BIOS data, and some to RAM.
1554  *
1555  * The module list was processed and checked already by dboot_process_modules(),
1556  * so we will check the command line string and the memory map.
1557  *
1558  * This list of to be checked items is based on our current knowledge of
1559  * allocations made by grub1 and will need to be reviewed if there
1560  * are updates about the information provided by Multiboot 1.
1561  *
1562  * In the case of the Multiboot 2, our life is much simpler, as the MB2
1563  * information tag list is one contiguous chunk of memory.
1564  */
1565 static paddr_t
1566 dboot_multiboot1_highest_addr(void)
1567 {
1568 	paddr_t addr = (paddr_t)(uintptr_t)NULL;
1569 	char *cmdl = (char *)mb_info->cmdline;
1570 
1571 	if (mb_info->flags & MB_INFO_CMDLINE)
1572 		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1573 
1574 	if (mb_info->flags & MB_INFO_MEM_MAP)
1575 		addr = MAX(addr,
1576 		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1577 	return (addr);
1578 }
1579 
1580 static void
1581 dboot_multiboot_highest_addr(void)
1582 {
1583 	paddr_t addr;
1584 
1585 	switch (multiboot_version) {
1586 	case 1:
1587 		addr = dboot_multiboot1_highest_addr();
1588 		if (addr != (paddr_t)(uintptr_t)NULL)
1589 			check_higher(addr);
1590 		break;
1591 	case 2:
1592 		addr = dboot_multiboot2_highest_addr(mb2_info);
1593 		if (addr != (paddr_t)(uintptr_t)NULL)
1594 			check_higher(addr);
1595 		break;
1596 	default:
1597 		dboot_panic("Unknown multiboot version: %d\n",
1598 		    multiboot_version);
1599 		break;
1600 	}
1601 }
1602 
/*
 * Prepare the memory allocator by walking everything the boot loader told
 * us: first the modules, then the memory map, and finally the remaining
 * loader data that may sit above them.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	/* modules: builds the module list and tracks their end addresses */
	dboot_process_modules();

	/* memory map: builds the memlists */
	dboot_process_mmap();

	/* command line, memory map or tag list placed by the loader */
	dboot_multiboot_highest_addr();
}
1614 
1615 static int
1616 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1617 {
1618 	int i;
1619 
1620 	if (g1->time_low != g2->time_low)
1621 		return (0);
1622 	if (g1->time_mid != g2->time_mid)
1623 		return (0);
1624 	if (g1->time_hi_and_version != g2->time_hi_and_version)
1625 		return (0);
1626 	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1627 		return (0);
1628 	if (g1->clock_seq_low != g2->clock_seq_low)
1629 		return (0);
1630 
1631 	for (i = 0; i < 6; i++) {
1632 		if (g1->node_addr[i] != g2->node_addr[i])
1633 			return (0);
1634 	}
1635 	return (1);
1636 }
1637 
1638 static void
1639 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1640 {
1641 	uint32_t entries;
1642 	EFI_CONFIGURATION_TABLE32 *config;
1643 	int i;
1644 
1645 	entries = efi->NumberOfTableEntries;
1646 	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1647 	    efi->ConfigurationTable;
1648 
1649 	for (i = 0; i < entries; i++) {
1650 		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1651 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1652 			    config[i].VendorTable;
1653 		}
1654 		if (bi->bi_smbios == NULL &&
1655 		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1656 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1657 			    config[i].VendorTable;
1658 		}
1659 		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1660 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1661 			    config[i].VendorTable;
1662 		}
1663 		if (bi->bi_acpi_rsdp == NULL &&
1664 		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1665 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1666 			    config[i].VendorTable;
1667 		}
1668 	}
1669 }
1670 
1671 static void
1672 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1673 {
1674 	uint64_t entries;
1675 	EFI_CONFIGURATION_TABLE64 *config;
1676 	int i;
1677 
1678 	entries = efi->NumberOfTableEntries;
1679 	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1680 	    efi->ConfigurationTable;
1681 
1682 	for (i = 0; i < entries; i++) {
1683 		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1684 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1685 			    config[i].VendorTable;
1686 		}
1687 		if (bi->bi_smbios == NULL &&
1688 		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1689 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1690 			    config[i].VendorTable;
1691 		}
1692 		/* Prefer acpi v2+ over v1. */
1693 		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1694 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1695 			    config[i].VendorTable;
1696 		}
1697 		if (bi->bi_acpi_rsdp == NULL &&
1698 		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1699 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1700 			    config[i].VendorTable;
1701 		}
1702 	}
1703 }
1704 
1705 static void
1706 dboot_multiboot_get_fwtables(void)
1707 {
1708 	multiboot_tag_new_acpi_t *nacpitagp;
1709 	multiboot_tag_old_acpi_t *oacpitagp;
1710 	multiboot_tag_efi64_t *efi64tagp = NULL;
1711 	multiboot_tag_efi32_t *efi32tagp = NULL;
1712 
1713 	/* no fw tables from multiboot 1 */
1714 	if (multiboot_version != 2)
1715 		return;
1716 
1717 	efi64tagp = (multiboot_tag_efi64_t *)
1718 	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1719 	if (efi64tagp != NULL) {
1720 		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1721 		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1722 		    efi64tagp->mb_pointer;
1723 		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1724 		    efi64tagp->mb_pointer);
1725 	} else {
1726 		efi32tagp = (multiboot_tag_efi32_t *)
1727 		    dboot_multiboot2_find_tag(mb2_info,
1728 		    MULTIBOOT_TAG_TYPE_EFI32);
1729 		if (efi32tagp != NULL) {
1730 			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1731 			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1732 			    efi32tagp->mb_pointer;
1733 			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1734 			    efi32tagp->mb_pointer);
1735 		}
1736 	}
1737 
1738 	/*
1739 	 * The ACPI RSDP can be found by scanning the BIOS memory areas or
1740 	 * from the EFI system table. The boot loader may pass in the address
1741 	 * it found the ACPI tables at.
1742 	 */
1743 	nacpitagp = (multiboot_tag_new_acpi_t *)
1744 	    dboot_multiboot2_find_tag(mb2_info,
1745 	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
1746 	oacpitagp = (multiboot_tag_old_acpi_t *)
1747 	    dboot_multiboot2_find_tag(mb2_info,
1748 	    MULTIBOOT_TAG_TYPE_ACPI_OLD);
1749 
1750 	if (nacpitagp != NULL) {
1751 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1752 		    &nacpitagp->mb_rsdp[0];
1753 	} else if (oacpitagp != NULL) {
1754 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1755 		    &oacpitagp->mb_rsdp[0];
1756 	}
1757 }
1758 
/*
 * Print an EFI revision value followed by a newline.
 * The output is "major.tens" and, only when the last decimal digit of the
 * minor number is non-zero, "major.tens.ones".
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor = EFI_REV_MINOR(ver);
	int tens = minor / 10;
	int ones = minor % 10;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	if (ones == 0) {
		dboot_printf("%d\n", tens);
	} else {
		dboot_printf("%d.%d\n", tens, ones);
	}
}
1774 
1775 static void
1776 print_efi32(EFI_SYSTEM_TABLE32 *efi)
1777 {
1778 	uint16_t *data;
1779 	EFI_CONFIGURATION_TABLE32 *conf;
1780 	int i;
1781 
1782 	dboot_printf("EFI32 signature: %llx\n",
1783 	    (unsigned long long)efi->Hdr.Signature);
1784 	dboot_printf("EFI system version: ");
1785 	dboot_print_efi_version(efi->Hdr.Revision);
1786 	dboot_printf("EFI system vendor: ");
1787 	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1788 	for (i = 0; data[i] != 0; i++)
1789 		dboot_printf("%c", (char)data[i]);
1790 	dboot_printf("\nEFI firmware revision: ");
1791 	dboot_print_efi_version(efi->FirmwareRevision);
1792 	dboot_printf("EFI system table number of entries: %d\n",
1793 	    efi->NumberOfTableEntries);
1794 	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1795 	    efi->ConfigurationTable;
1796 	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1797 		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1798 		    conf[i].VendorGuid.time_low,
1799 		    conf[i].VendorGuid.time_mid,
1800 		    conf[i].VendorGuid.time_hi_and_version,
1801 		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1802 		    conf[i].VendorGuid.clock_seq_low);
1803 		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1804 		    conf[i].VendorGuid.node_addr[0],
1805 		    conf[i].VendorGuid.node_addr[1],
1806 		    conf[i].VendorGuid.node_addr[2],
1807 		    conf[i].VendorGuid.node_addr[3],
1808 		    conf[i].VendorGuid.node_addr[4],
1809 		    conf[i].VendorGuid.node_addr[5]);
1810 	}
1811 }
1812 
1813 static void
1814 print_efi64(EFI_SYSTEM_TABLE64 *efi)
1815 {
1816 	uint16_t *data;
1817 	EFI_CONFIGURATION_TABLE64 *conf;
1818 	int i;
1819 
1820 	dboot_printf("EFI64 signature: %llx\n",
1821 	    (unsigned long long)efi->Hdr.Signature);
1822 	dboot_printf("EFI system version: ");
1823 	dboot_print_efi_version(efi->Hdr.Revision);
1824 	dboot_printf("EFI system vendor: ");
1825 	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1826 	for (i = 0; data[i] != 0; i++)
1827 		dboot_printf("%c", (char)data[i]);
1828 	dboot_printf("\nEFI firmware revision: ");
1829 	dboot_print_efi_version(efi->FirmwareRevision);
1830 	dboot_printf("EFI system table number of entries: %lld\n",
1831 	    efi->NumberOfTableEntries);
1832 	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1833 	    efi->ConfigurationTable;
1834 	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1835 		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1836 		    conf[i].VendorGuid.time_low,
1837 		    conf[i].VendorGuid.time_mid,
1838 		    conf[i].VendorGuid.time_hi_and_version,
1839 		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1840 		    conf[i].VendorGuid.clock_seq_low);
1841 		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1842 		    conf[i].VendorGuid.node_addr[0],
1843 		    conf[i].VendorGuid.node_addr[1],
1844 		    conf[i].VendorGuid.node_addr[2],
1845 		    conf[i].VendorGuid.node_addr[3],
1846 		    conf[i].VendorGuid.node_addr[4],
1847 		    conf[i].VendorGuid.node_addr[5]);
1848 	}
1849 }
1850 #endif /* !__xpv */
1851 
1852 /*
1853  * Simple memory allocator, allocates aligned physical memory.
1854  * Note that startup_kernel() only allocates memory, never frees.
1855  * Memory usage just grows in an upward direction.
1856  */
1857 static void *
1858 do_mem_alloc(uint32_t size, uint32_t align)
1859 {
1860 	uint_t i;
1861 	uint64_t best;
1862 	uint64_t start;
1863 	uint64_t end;
1864 
1865 	/*
1866 	 * make sure size is a multiple of pagesize
1867 	 */
1868 	size = RNDUP(size, MMU_PAGESIZE);
1869 	next_avail_addr = RNDUP(next_avail_addr, align);
1870 
1871 	/*
1872 	 * XXPV fixme joe
1873 	 *
1874 	 * a really large bootarchive that causes you to run out of memory
1875 	 * may cause this to blow up
1876 	 */
1877 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
1878 	best = (uint64_t)-size;
1879 	for (i = 0; i < memlists_used; ++i) {
1880 		start = memlists[i].addr;
1881 #if defined(__xpv)
1882 		start += mfn_base;
1883 #endif
1884 		end = start + memlists[i].size;
1885 
1886 		/*
1887 		 * did we find the desired address?
1888 		 */
1889 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
1890 			best = next_avail_addr;
1891 			goto done;
1892 		}
1893 
1894 		/*
1895 		 * if not is this address the best so far?
1896 		 */
1897 		if (start > next_avail_addr && start < best &&
1898 		    RNDUP(start, align) + size <= end)
1899 			best = RNDUP(start, align);
1900 	}
1901 
1902 	/*
1903 	 * We didn't find exactly the address we wanted, due to going off the
1904 	 * end of a memory region. Return the best found memory address.
1905 	 */
1906 done:
1907 	next_avail_addr = best + size;
1908 #if defined(__xpv)
1909 	if (next_avail_addr > scratch_end)
1910 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1911 		    "0x%lx", (ulong_t)next_avail_addr,
1912 		    (ulong_t)scratch_end);
1913 #endif
1914 	(void) memset((void *)(uintptr_t)best, 0, size);
1915 	return ((void *)(uintptr_t)best);
1916 }
1917 
1918 void *
1919 mem_alloc(uint32_t size)
1920 {
1921 	return (do_mem_alloc(size, MMU_PAGESIZE));
1922 }
1923 
1924 
/*
 * Build page tables to map all of memory used so far as well as the kernel.
 *
 * In order, this:
 *  - obtains the top level page table (allocated on metal, taken from
 *    xen_info under the hypervisor),
 *  - maps ksize bytes of kernel from ktext_phys at target_kernel_text,
 *    using large pages when largepage_support is set,
 *  - allocates the one page page-table window for the kernel and records
 *    it, together with the find_pte() result for it, in the boot info,
 *  - creates 1:1 mappings for the low 1MB,
 *  - on metal only: creates 1:1 mappings for the memlists regions up to
 *    next_avail_addr and an uncached 1:1 mapping of the framebuffer.
 *
 * Under the hypervisor a non-initial domain stops after the page-table
 * window has been set up.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	/* one mapping per psize step, at the level chosen above */
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	/* NOTE(review): presumably the PTE that maps the window - confirm */
	bi->bi_pte_to_pt_window =
	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed device in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)

	/*
	 * 1:1 map each memlists region page by page, stopping at
	 * next_avail_addr; later regions are not visited once that
	 * address has been reached.
	 */
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		if (start >= next_avail_addr)
			break;
	}

	/*
	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
	 * device and therefore must not be cached.
	 */
	if (bi->bi_framebuffer != NULL) {
		multiboot_tag_framebuffer_t *fb;
		fb = (multiboot_tag_framebuffer_t *)(uintptr_t)
		    bi->bi_framebuffer;

		/* the framebuffer spans height * pitch bytes from its base */
		start = fb->framebuffer_common.framebuffer_addr;
		end = start + fb->framebuffer_common.framebuffer_height *
		    fb->framebuffer_common.framebuffer_pitch;

		if (map_debug)
			dboot_printf("FB 1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		/* PT_NOCACHE is set in pte_bits only for these mappings */
		pte_bits |= PT_NOCACHE;
		while (start < end) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		pte_bits &= ~PT_NOCACHE;
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}
2055 
/*
 * Message text telling the operator that the multiboot program is no longer
 * used and showing the grub "kernel$"/"module$" lines to use instead.
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2062 
2063 static void
2064 dboot_init_xboot_consinfo(void)
2065 {
2066 	uintptr_t addr;
2067 	/*
2068 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
2069 	 */
2070 	addr = (uintptr_t)boot_info;
2071 	addr = (addr + 0xf) & ~0xf;
2072 	bi = (struct xboot_info *)addr;
2073 
2074 #if !defined(__xpv)
2075 	switch (multiboot_version) {
2076 	case 1:
2077 		dboot_multiboot1_xboot_consinfo();
2078 		break;
2079 	case 2:
2080 		dboot_multiboot2_xboot_consinfo();
2081 		break;
2082 	default:
2083 		dboot_panic("Unknown multiboot version: %d\n",
2084 		    multiboot_version);
2085 		break;
2086 	}
2087 	/*
2088 	 * Lookup environment module for the console. Complete module list
2089 	 * will be built after console setup.
2090 	 */
2091 	dboot_find_env();
2092 #endif
2093 }
2094 
2095 /*
2096  * Set up basic data from the boot loader.
2097  * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2098  * 32-bit dboot code setup used to set up and start 64-bit kernel.
2099  * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2100  * start 64-bit illumos kernel.
2101  */
2102 static void
2103 dboot_loader_init(void)
2104 {
2105 #if !defined(__xpv)
2106 	mb_info = NULL;
2107 	mb2_info = NULL;
2108 
2109 	switch (mb_magic) {
2110 	case MB_BOOTLOADER_MAGIC:
2111 		multiboot_version = 1;
2112 		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2113 #if defined(_BOOT_TARGET_amd64)
2114 		load_addr = mb_header.load_addr;
2115 #endif
2116 		break;
2117 
2118 	case MULTIBOOT2_BOOTLOADER_MAGIC:
2119 		multiboot_version = 2;
2120 		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2121 		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2122 #if defined(_BOOT_TARGET_amd64)
2123 		load_addr = mb2_load_addr;
2124 #endif
2125 		break;
2126 
2127 	default:
2128 		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2129 		break;
2130 	}
2131 #endif	/* !defined(__xpv) */
2132 }
2133 
2134 /* Extract the kernel command line from [multi]boot information. */
2135 static char *
2136 dboot_loader_cmdline(void)
2137 {
2138 	char *line = NULL;
2139 
2140 #if defined(__xpv)
2141 	line = (char *)xen_info->cmd_line;
2142 #else /* __xpv */
2143 
2144 	switch (multiboot_version) {
2145 	case 1:
2146 		if (mb_info->flags & MB_INFO_CMDLINE)
2147 			line = (char *)mb_info->cmdline;
2148 		break;
2149 
2150 	case 2:
2151 		line = dboot_multiboot2_cmdline(mb2_info);
2152 		break;
2153 
2154 	default:
2155 		dboot_panic("Unknown multiboot version: %d\n",
2156 		    multiboot_version);
2157 		break;
2158 	}
2159 
2160 #endif /* __xpv */
2161 
2162 	/*
2163 	 * Make sure we have valid pointer so the string operations
2164 	 * will not crash us.
2165 	 */
2166 	if (line == NULL)
2167 		line = "";
2168 
2169 	return (line);
2170 }
2171 
2172 static char *
2173 dboot_loader_name(void)
2174 {
2175 #if defined(__xpv)
2176 	return (NULL);
2177 #else /* __xpv */
2178 	multiboot_tag_string_t *tag;
2179 
2180 	switch (multiboot_version) {
2181 	case 1:
2182 		return ((char *)mb_info->boot_loader_name);
2183 
2184 	case 2:
2185 		tag = dboot_multiboot2_find_tag(mb2_info,
2186 		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2187 		return (tag->mb_string);
2188 	default:
2189 		dboot_panic("Unknown multiboot version: %d\n",
2190 		    multiboot_version);
2191 		break;
2192 	}
2193 
2194 	return (NULL);
2195 #endif /* __xpv */
2196 }
2197 
2198 /*
2199  * startup_kernel has a pretty simple job. It builds pagetables which reflect
2200  * 1:1 mappings for all memory in use. It then also adds mappings for
2201  * the kernel nucleus at virtual address of target_kernel_text using large page
2202  * mappings. The page table pages are also accessible at 1:1 mapped
2203  * virtual addresses.
2204  */
2205 /*ARGSUSED*/
2206 void
2207 startup_kernel(void)
2208 {
2209 	char *cmdline;
2210 	char *bootloader;
2211 #if defined(__xpv)
2212 	physdev_set_iopl_t set_iopl;
2213 #endif /* __xpv */
2214 
2215 	if (dboot_debug == 1)
2216 		bcons_init(NULL);	/* Set very early console to ttya. */
2217 	dboot_loader_init();
2218 	/*
2219 	 * At this point we are executing in a 32 bit real mode.
2220 	 */
2221 
2222 	bootloader = dboot_loader_name();
2223 	cmdline = dboot_loader_cmdline();
2224 
2225 #if defined(__xpv)
2226 	/*
2227 	 * For dom0, before we initialize the console subsystem we'll
2228 	 * need to enable io operations, so set I/O priveldge level to 1.
2229 	 */
2230 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2231 		set_iopl.iopl = 1;
2232 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2233 	}
2234 #endif /* __xpv */
2235 
2236 	dboot_init_xboot_consinfo();
2237 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2238 	bcons_init(bi);		/* Now we can set the real console. */
2239 
2240 	prom_debug = (find_boot_prop("prom_debug") != NULL);
2241 	map_debug = (find_boot_prop("map_debug") != NULL);
2242 
2243 #if !defined(__xpv)
2244 	dboot_multiboot_get_fwtables();
2245 #endif
2246 	DBG_MSG("\n\nillumos prekernel set: ");
2247 	DBG_MSG(cmdline);
2248 	DBG_MSG("\n");
2249 
2250 	if (bootloader != NULL && prom_debug) {
2251 		dboot_printf("Kernel loaded by: %s\n", bootloader);
2252 #if !defined(__xpv)
2253 		dboot_printf("Using multiboot %d boot protocol.\n",
2254 		    multiboot_version);
2255 #endif
2256 	}
2257 
2258 	if (strstr(cmdline, "multiboot") != NULL) {
2259 		dboot_panic(NO_MULTIBOOT);
2260 	}
2261 
2262 	DBG((uintptr_t)bi);
2263 #if !defined(__xpv)
2264 	DBG((uintptr_t)mb_info);
2265 	DBG((uintptr_t)mb2_info);
2266 	if (mb2_info != NULL)
2267 		DBG(mb2_info->mbi_total_size);
2268 	DBG(bi->bi_acpi_rsdp);
2269 	DBG(bi->bi_smbios);
2270 	DBG(bi->bi_uefi_arch);
2271 	DBG(bi->bi_uefi_systab);
2272 
2273 	if (bi->bi_uefi_systab && prom_debug) {
2274 		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2275 			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2276 			    bi->bi_uefi_systab);
2277 		} else {
2278 			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2279 			    bi->bi_uefi_systab);
2280 		}
2281 	}
2282 #endif
2283 
2284 	/*
2285 	 * Need correct target_kernel_text value
2286 	 */
2287 #if defined(_BOOT_TARGET_amd64)
2288 	target_kernel_text = KERNEL_TEXT_amd64;
2289 #elif defined(__xpv)
2290 	target_kernel_text = KERNEL_TEXT_i386_xpv;
2291 #else
2292 	target_kernel_text = KERNEL_TEXT_i386;
2293 #endif
2294 	DBG(target_kernel_text);
2295 
2296 #if defined(__xpv)
2297 
2298 	/*
2299 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
2300 	 */
2301 
2302 #if defined(_BOOT_TARGET_amd64)
2303 	/*
2304 	 * 64-bit hypervisor.
2305 	 */
2306 	amd64_support = 1;
2307 	pae_support = 1;
2308 
2309 #else	/* _BOOT_TARGET_amd64 */
2310 
2311 	/*
2312 	 * See if we are running on a PAE Hypervisor
2313 	 */
2314 	{
2315 		xen_capabilities_info_t caps;
2316 
2317 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2318 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
2319 		caps[sizeof (caps) - 1] = 0;
2320 		if (prom_debug)
2321 			dboot_printf("xen capabilities %s\n", caps);
2322 		if (strstr(caps, "x86_32p") != NULL)
2323 			pae_support = 1;
2324 	}
2325 
2326 #endif	/* _BOOT_TARGET_amd64 */
2327 	{
2328 		xen_platform_parameters_t p;
2329 
2330 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2331 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
2332 		DBG(p.virt_start);
2333 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2334 	}
2335 
2336 	/*
2337 	 * The hypervisor loads stuff starting at 1Gig
2338 	 */
2339 	mfn_base = ONE_GIG;
2340 	DBG(mfn_base);
2341 
2342 	/*
2343 	 * enable writable page table mode for the hypervisor
2344 	 */
2345 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2346 	    VMASST_TYPE_writable_pagetables) < 0)
2347 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2348 
2349 	/*
2350 	 * check for NX support
2351 	 */
2352 	if (pae_support) {
2353 		uint32_t eax = 0x80000000;
2354 		uint32_t edx = get_cpuid_edx(&eax);
2355 
2356 		if (eax >= 0x80000001) {
2357 			eax = 0x80000001;
2358 			edx = get_cpuid_edx(&eax);
2359 			if (edx & CPUID_AMD_EDX_NX)
2360 				NX_support = 1;
2361 		}
2362 	}
2363 
2364 #if !defined(_BOOT_TARGET_amd64)
2365 
2366 	/*
2367 	 * The 32-bit hypervisor uses segmentation to protect itself from
2368 	 * guests. This means when a guest attempts to install a flat 4GB
2369 	 * code or data descriptor the 32-bit hypervisor will protect itself
2370 	 * by silently shrinking the segment such that if the guest attempts
2371 	 * any access where the hypervisor lives a #gp fault is generated.
2372 	 * The problem is that some applications expect a full 4GB flat
2373 	 * segment for their current thread pointer and will use negative
2374 	 * offset segment wrap around to access data. TLS support in linux
2375 	 * brand is one example of this.
2376 	 *
2377 	 * The 32-bit hypervisor can catch the #gp fault in these cases
2378 	 * and emulate the access without passing the #gp fault to the guest
2379 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2380 	 * Seems like this should have been the default.
2381 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
2382 	 * to deal with emulating these accesses.
2383 	 */
2384 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2385 	    VMASST_TYPE_4gb_segments) < 0)
2386 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2387 #endif	/* !_BOOT_TARGET_amd64 */
2388 
2389 #else	/* __xpv */
2390 
2391 	/*
2392 	 * use cpuid to enable MMU features
2393 	 */
2394 	if (have_cpuid()) {
2395 		uint32_t eax, edx;
2396 
2397 		eax = 1;
2398 		edx = get_cpuid_edx(&eax);
2399 		if (edx & CPUID_INTC_EDX_PSE)
2400 			largepage_support = 1;
2401 		if (edx & CPUID_INTC_EDX_PGE)
2402 			pge_support = 1;
2403 		if (edx & CPUID_INTC_EDX_PAE)
2404 			pae_support = 1;
2405 
2406 		eax = 0x80000000;
2407 		edx = get_cpuid_edx(&eax);
2408 		if (eax >= 0x80000001) {
2409 			eax = 0x80000001;
2410 			edx = get_cpuid_edx(&eax);
2411 			if (edx & CPUID_AMD_EDX_LM)
2412 				amd64_support = 1;
2413 			if (edx & CPUID_AMD_EDX_NX)
2414 				NX_support = 1;
2415 		}
2416 	} else {
2417 		dboot_printf("cpuid not supported\n");
2418 	}
2419 #endif /* __xpv */
2420 
2421 
2422 #if defined(_BOOT_TARGET_amd64)
2423 	if (amd64_support == 0)
2424 		dboot_panic("long mode not supported, rebooting");
2425 	else if (pae_support == 0)
2426 		dboot_panic("long mode, but no PAE; rebooting");
2427 #else
2428 	/*
2429 	 * Allow the command line to over-ride use of PAE for 32 bit.
2430 	 */
2431 	if (strstr(cmdline, "disablePAE=true") != NULL) {
2432 		pae_support = 0;
2433 		NX_support = 0;
2434 		amd64_support = 0;
2435 	}
2436 #endif
2437 
2438 	/*
2439 	 * initialize the simple memory allocator
2440 	 */
2441 	init_mem_alloc();
2442 
2443 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2444 	/*
2445 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2446 	 */
2447 	if (max_mem < FOUR_GIG && NX_support == 0)
2448 		pae_support = 0;
2449 #endif
2450 
2451 	/*
2452 	 * configure mmu information
2453 	 */
2454 	if (pae_support) {
2455 		shift_amt = shift_amt_pae;
2456 		ptes_per_table = 512;
2457 		pte_size = 8;
2458 		lpagesize = TWO_MEG;
2459 #if defined(_BOOT_TARGET_amd64)
2460 		top_level = 3;
2461 #else
2462 		top_level = 2;
2463 #endif
2464 	} else {
2465 		pae_support = 0;
2466 		NX_support = 0;
2467 		shift_amt = shift_amt_nopae;
2468 		ptes_per_table = 1024;
2469 		pte_size = 4;
2470 		lpagesize = FOUR_MEG;
2471 		top_level = 1;
2472 	}
2473 
2474 	DBG(pge_support);
2475 	DBG(NX_support);
2476 	DBG(largepage_support);
2477 	DBG(amd64_support);
2478 	DBG(top_level);
2479 	DBG(pte_size);
2480 	DBG(ptes_per_table);
2481 	DBG(lpagesize);
2482 
2483 #if defined(__xpv)
2484 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
2485 #else
2486 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
2487 #endif
2488 
2489 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2490 	/*
2491 	 * For grub, copy kernel bits from the ELF64 file to final place.
2492 	 */
2493 	DBG_MSG("\nAllocating nucleus pages.\n");
2494 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2495 
2496 	if (ktext_phys == 0)
2497 		dboot_panic("failed to allocate aligned kernel memory");
2498 	DBG(load_addr);
2499 	if (dboot_elfload64(load_addr) != 0)
2500 		dboot_panic("failed to parse kernel ELF image, rebooting");
2501 #endif
2502 
2503 	DBG(ktext_phys);
2504 
2505 	/*
2506 	 * Allocate page tables.
2507 	 */
2508 	build_page_tables();
2509 
2510 	/*
2511 	 * return to assembly code to switch to running kernel
2512 	 */
2513 	entry_addr_low = (uint32_t)target_kernel_text;
2514 	DBG(entry_addr_low);
2515 	bi->bi_use_largepage = largepage_support;
2516 	bi->bi_use_pae = pae_support;
2517 	bi->bi_use_pge = pge_support;
2518 	bi->bi_use_nx = NX_support;
2519 
2520 #if defined(__xpv)
2521 
2522 	bi->bi_next_paddr = next_avail_addr - mfn_base;
2523 	DBG(bi->bi_next_paddr);
2524 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2525 	DBG(bi->bi_next_vaddr);
2526 
2527 	/*
2528 	 * unmap unused pages in start area to make them available for DMA
2529 	 */
2530 	while (next_avail_addr < scratch_end) {
2531 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
2532 		    0, UVMF_INVLPG | UVMF_LOCAL);
2533 		next_avail_addr += MMU_PAGESIZE;
2534 	}
2535 
2536 	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2537 	DBG((uintptr_t)HYPERVISOR_shared_info);
2538 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2539 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2540 
2541 #else /* __xpv */
2542 
2543 	bi->bi_next_paddr = next_avail_addr;
2544 	DBG(bi->bi_next_paddr);
2545 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2546 	DBG(bi->bi_next_vaddr);
2547 	bi->bi_mb_version = multiboot_version;
2548 
2549 	switch (multiboot_version) {
2550 	case 1:
2551 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2552 		break;
2553 	case 2:
2554 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2555 		break;
2556 	default:
2557 		dboot_panic("Unknown multiboot version: %d\n",
2558 		    multiboot_version);
2559 		break;
2560 	}
2561 	bi->bi_top_page_table = (uintptr_t)top_page_table;
2562 
2563 #endif /* __xpv */
2564 
2565 	bi->bi_kseg_size = FOUR_MEG;
2566 	DBG(bi->bi_kseg_size);
2567 
2568 #ifndef __xpv
2569 	if (map_debug)
2570 		dump_tables();
2571 #endif
2572 
2573 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2574 }
2575