xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision f362c74cdda7b4819bb5d3360149ac0fae9ea013)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2013 Joyent, Inc.  All rights reserved.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/sha1.h>
40 #include <util/string.h>
41 #include <util/strtolctype.h>
42 #include <sys/efi.h>
43 
44 #if defined(__xpv)
45 
46 #include <sys/hypervisor.h>
47 uintptr_t xen_virt_start;
48 pfn_t *mfn_to_pfn_mapping;
49 
50 #else /* !__xpv */
51 
52 extern multiboot_header_t mb_header;
53 extern uint32_t mb2_load_addr;
54 extern int have_cpuid(void);
55 
56 #endif /* !__xpv */
57 
58 #include <sys/inttypes.h>
59 #include <sys/bootinfo.h>
60 #include <sys/mach_mmu.h>
61 #include <sys/boot_console.h>
62 
63 #include "dboot_asm.h"
64 #include "dboot_printf.h"
65 #include "dboot_xboot.h"
66 #include "dboot_elfload.h"
67 
68 #define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
69 
70 /*
71  * This file contains code that runs to transition us from either a multiboot
72  * compliant loader (32 bit non-paging) or a XPV domain loader to
73  * regular kernel execution. Its task is to setup the kernel memory image
74  * and page tables.
75  *
76  * The code executes as:
77  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
78  * 	- a 32 bit program for the 32-bit PV hypervisor
79  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
80  *
81  * Under the PV hypervisor, we must create mappings for any memory beyond the
82  * initial start of day allocation (such as the kernel itself).
83  *
84  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
85  * Since we are running in real mode, all such memory is accessible.
86  */
87 
88 /*
89  * Standard bits used in PTE (page level) and PTP (internal levels)
90  */
91 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
92 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
93 
94 /*
95  * This is the target addresses (physical) where the kernel text and data
96  * nucleus pages will be unpacked. On the hypervisor this is actually a
97  * virtual address.
98  */
99 paddr_t ktext_phys;
100 uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */
101 
102 static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */
103 
104 /*
105  * The stack is setup in assembler before entering startup_kernel()
106  */
107 char stack_space[STACK_SIZE];
108 
109 /*
110  * Used to track physical memory allocation
111  */
112 static paddr_t next_avail_addr = 0;
113 
114 #if defined(__xpv)
115 /*
116  * Additional information needed for hypervisor memory allocation.
117  * Only memory up to scratch_end is mapped by page tables.
118  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
119  * to derive a pfn from a pointer, you subtract mfn_base.
120  */
121 
122 static paddr_t scratch_end = 0;	/* we can't write all of mem here */
123 static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
124 start_info_t *xen_info;
125 
126 #else	/* __xpv */
127 
128 /*
129  * If on the metal, then we have a multiboot loader.
130  */
131 uint32_t mb_magic;			/* magic from boot loader */
132 uint32_t mb_addr;			/* multiboot info package from loader */
133 int multiboot_version;
134 multiboot_info_t *mb_info;
135 multiboot2_info_header_t *mb2_info;
136 multiboot_tag_mmap_t *mb2_mmap_tagp;
137 int num_entries;			/* mmap entry count */
138 boolean_t num_entries_set;		/* is mmap entry count set */
139 uintptr_t load_addr;
140 
141 /* can not be automatic variables because of alignment */
142 static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
143 static efi_guid_t smbios = SMBIOS_TABLE_GUID;
144 static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
145 static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
146 #endif	/* __xpv */
147 
148 /*
149  * This contains information passed to the kernel
150  */
151 struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
152 struct xboot_info *bi;
153 
154 /*
155  * Page table and memory stuff.
156  */
157 static paddr_t max_mem;			/* maximum memory address */
158 
159 /*
160  * Information about processor MMU
161  */
162 int amd64_support = 0;
163 int largepage_support = 0;
164 int pae_support = 0;
165 int pge_support = 0;
166 int NX_support = 0;
167 
168 /*
169  * Low 32 bits of kernel entry address passed back to assembler.
170  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
171  */
172 uint32_t entry_addr_low;
173 
174 /*
175  * Memlists for the kernel. We shouldn't need a lot of these.
176  */
177 #define	MAX_MEMLIST (50)
178 struct boot_memlist memlists[MAX_MEMLIST];
179 uint_t memlists_used = 0;
180 struct boot_memlist pcimemlists[MAX_MEMLIST];
181 uint_t pcimemlists_used = 0;
182 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
183 uint_t rsvdmemlists_used = 0;
184 
185 /*
186  * This should match what's in the bootloader.  It's arbitrary, but GRUB
187  * in particular has limitations on how much space it can use before it
188  * stops working properly.  This should be enough.
189  */
190 struct boot_modules modules[MAX_BOOT_MODULES];
191 uint_t modules_used = 0;
192 
193 #ifdef __xpv
194 /*
195  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
196  * definition in Xen source.
197  */
198 typedef struct {
199 	uint32_t	base_addr_low;
200 	uint32_t	base_addr_high;
201 	uint32_t	length_low;
202 	uint32_t	length_high;
203 	uint32_t	type;
204 } mmap_t;
205 
206 /*
207  * There is 512KB of scratch area after the boot stack page.
208  * We'll use that for everything except the kernel nucleus pages which are too
209  * big to fit there and are allocated last anyway.
210  */
211 #define	MAXMAPS	100
212 static mmap_t map_buffer[MAXMAPS];
213 #else
214 typedef mb_memory_map_t mmap_t;
215 #endif
216 
217 /*
218  * Debugging macros
219  */
220 uint_t prom_debug = 0;
221 uint_t map_debug = 0;
222 
223 static char noname[2] = "-";
224 
225 /*
226  * Either hypervisor-specific or grub-specific code builds the initial
227  * memlists. This code does the sort/merge/link for final use.
228  */
229 static void
230 sort_physinstall(void)
231 {
232 	int i;
233 #if !defined(__xpv)
234 	int j;
235 	struct boot_memlist tmp;
236 
237 	/*
238 	 * Now sort the memlists, in case they weren't in order.
239 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
240 	 */
241 	DBG_MSG("Sorting phys-installed list\n");
242 	for (j = memlists_used - 1; j > 0; --j) {
243 		for (i = 0; i < j; ++i) {
244 			if (memlists[i].addr < memlists[i + 1].addr)
245 				continue;
246 			tmp = memlists[i];
247 			memlists[i] = memlists[i + 1];
248 			memlists[i + 1] = tmp;
249 		}
250 	}
251 
252 	/*
253 	 * Merge any memlists that don't have holes between them.
254 	 */
255 	for (i = 0; i <= memlists_used - 1; ++i) {
256 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
257 			continue;
258 
259 		if (prom_debug)
260 			dboot_printf(
261 			    "merging mem segs %" PRIx64 "...%" PRIx64
262 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
263 			    memlists[i].addr,
264 			    memlists[i].addr + memlists[i].size,
265 			    memlists[i + 1].addr,
266 			    memlists[i + 1].addr + memlists[i + 1].size);
267 
268 		memlists[i].size += memlists[i + 1].size;
269 		for (j = i + 1; j < memlists_used - 1; ++j)
270 			memlists[j] = memlists[j + 1];
271 		--memlists_used;
272 		DBG(memlists_used);
273 		--i;	/* after merging we need to reexamine, so do this */
274 	}
275 #endif	/* __xpv */
276 
277 	if (prom_debug) {
278 		dboot_printf("\nFinal memlists:\n");
279 		for (i = 0; i < memlists_used; ++i) {
280 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
281 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
282 		}
283 	}
284 
285 	/*
286 	 * link together the memlists with native size pointers
287 	 */
288 	memlists[0].next = 0;
289 	memlists[0].prev = 0;
290 	for (i = 1; i < memlists_used; ++i) {
291 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
292 		memlists[i].next = 0;
293 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
294 	}
295 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
296 	DBG(bi->bi_phys_install);
297 }
298 
299 /*
300  * build bios reserved memlists
301  */
302 static void
303 build_rsvdmemlists(void)
304 {
305 	int i;
306 
307 	rsvdmemlists[0].next = 0;
308 	rsvdmemlists[0].prev = 0;
309 	for (i = 1; i < rsvdmemlists_used; ++i) {
310 		rsvdmemlists[i].prev =
311 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
312 		rsvdmemlists[i].next = 0;
313 		rsvdmemlists[i - 1].next =
314 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
315 	}
316 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
317 	DBG(bi->bi_rsvdmem);
318 }
319 
320 #if defined(__xpv)
321 
322 /*
323  * halt on the hypervisor after a delay to drain console output
324  */
325 void
326 dboot_halt(void)
327 {
328 	uint_t i = 10000;
329 
330 	while (--i)
331 		(void) HYPERVISOR_yield();
332 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
333 }
334 
335 /*
336  * From a machine address, find the corresponding pseudo-physical address.
337  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
338  * Machine addresses are the real underlying hardware addresses.
339  * These are needed for page table entries. Note that this routine is
340  * poorly protected. A bad value of "ma" will cause a page fault.
341  */
342 paddr_t
343 ma_to_pa(maddr_t ma)
344 {
345 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
346 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
347 	paddr_t pa;
348 
349 	if (pfn >= xen_info->nr_pages)
350 		return (-(paddr_t)1);
351 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
352 #ifdef DEBUG
353 	if (ma != pa_to_ma(pa))
354 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
355 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
356 #endif
357 	return (pa);
358 }
359 
360 /*
361  * From a pseudo-physical address, find the corresponding machine address.
362  */
363 maddr_t
364 pa_to_ma(paddr_t pa)
365 {
366 	pfn_t pfn;
367 	ulong_t mfn;
368 
369 	pfn = mmu_btop(pa - mfn_base);
370 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
371 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
372 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
373 #ifdef DEBUG
374 	if (mfn_to_pfn_mapping[mfn] != pfn)
375 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
376 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
377 #endif
378 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
379 }
380 
381 #endif	/* __xpv */
382 
383 x86pte_t
384 get_pteval(paddr_t table, uint_t index)
385 {
386 	if (pae_support)
387 		return (((x86pte_t *)(uintptr_t)table)[index]);
388 	return (((x86pte32_t *)(uintptr_t)table)[index]);
389 }
390 
391 /*ARGSUSED*/
392 void
393 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
394 {
395 #ifdef __xpv
396 	mmu_update_t t;
397 	maddr_t mtable = pa_to_ma(table);
398 	int retcnt;
399 
400 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
401 	t.val = pteval;
402 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
403 		dboot_panic("HYPERVISOR_mmu_update() failed");
404 #else /* __xpv */
405 	uintptr_t tab_addr = (uintptr_t)table;
406 
407 	if (pae_support)
408 		((x86pte_t *)tab_addr)[index] = pteval;
409 	else
410 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
411 	if (level == top_level && level == 2)
412 		reload_cr3();
413 #endif /* __xpv */
414 }
415 
416 paddr_t
417 make_ptable(x86pte_t *pteval, uint_t level)
418 {
419 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
420 
421 	if (level == top_level && level == 2)
422 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
423 	else
424 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
425 
426 #ifdef __xpv
427 	/* Remove write permission to the new page table. */
428 	if (HYPERVISOR_update_va_mapping(new_table,
429 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
430 		dboot_panic("HYP_update_va_mapping error");
431 #endif
432 
433 	if (map_debug)
434 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
435 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
436 	return (new_table);
437 }
438 
439 x86pte_t *
440 map_pte(paddr_t table, uint_t index)
441 {
442 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
443 }
444 
445 /*
446  * dump out the contents of page tables...
447  */
448 static void
449 dump_tables(void)
450 {
451 	uint_t save_index[4];	/* for recursion */
452 	char *save_table[4];	/* for recursion */
453 	uint_t	l;
454 	uint64_t va;
455 	uint64_t pgsize;
456 	int index;
457 	int i;
458 	x86pte_t pteval;
459 	char *table;
460 	static char *tablist = "\t\t\t";
461 	char *tabs = tablist + 3 - top_level;
462 	uint_t pa, pa1;
463 #if !defined(__xpv)
464 #define	maddr_t paddr_t
465 #endif /* !__xpv */
466 
467 	dboot_printf("Finished pagetables:\n");
468 	table = (char *)(uintptr_t)top_page_table;
469 	l = top_level;
470 	va = 0;
471 	for (index = 0; index < ptes_per_table; ++index) {
472 		pgsize = 1ull << shift_amt[l];
473 		if (pae_support)
474 			pteval = ((x86pte_t *)table)[index];
475 		else
476 			pteval = ((x86pte32_t *)table)[index];
477 		if (pteval == 0)
478 			goto next_entry;
479 
480 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
481 		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
482 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
483 		dboot_printf(" physaddr=%x\n", pa);
484 
485 		/*
486 		 * Don't try to walk hypervisor private pagetables
487 		 */
488 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
489 			save_table[l] = table;
490 			save_index[l] = index;
491 			--l;
492 			index = -1;
493 			table = (char *)(uintptr_t)
494 			    ma_to_pa(pteval & MMU_PAGEMASK);
495 			goto recursion;
496 		}
497 
498 		/*
499 		 * shorten dump for consecutive mappings
500 		 */
501 		for (i = 1; index + i < ptes_per_table; ++i) {
502 			if (pae_support)
503 				pteval = ((x86pte_t *)table)[index + i];
504 			else
505 				pteval = ((x86pte32_t *)table)[index + i];
506 			if (pteval == 0)
507 				break;
508 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
509 			if (pa1 != pa + i * pgsize)
510 				break;
511 		}
512 		if (i > 2) {
513 			dboot_printf("%s...\n", tabs + l);
514 			va += pgsize * (i - 2);
515 			index += i - 2;
516 		}
517 next_entry:
518 		va += pgsize;
519 		if (l == 3 && index == 256)	/* VA hole */
520 			va = 0xffff800000000000ull;
521 recursion:
522 		;
523 	}
524 	if (l < top_level) {
525 		++l;
526 		index = save_index[l];
527 		table = save_table[l];
528 		goto recursion;
529 	}
530 }
531 
532 /*
533  * Add a mapping for the machine page at the given virtual address.
534  */
535 static void
536 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
537 {
538 	x86pte_t *ptep;
539 	x86pte_t pteval;
540 
541 	pteval = ma | pte_bits;
542 	if (level > 0)
543 		pteval |= PT_PAGESIZE;
544 	if (va >= target_kernel_text && pge_support)
545 		pteval |= PT_GLOBAL;
546 
547 	if (map_debug && ma != va)
548 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
549 		    " pte=0x%" PRIx64 " l=%d\n",
550 		    (uint64_t)ma, (uint64_t)va, pteval, level);
551 
552 #if defined(__xpv)
553 	/*
554 	 * see if we can avoid find_pte() on the hypervisor
555 	 */
556 	if (HYPERVISOR_update_va_mapping(va, pteval,
557 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
558 		return;
559 #endif
560 
561 	/*
562 	 * Find the pte that will map this address. This creates any
563 	 * missing intermediate level page tables
564 	 */
565 	ptep = find_pte(va, NULL, level, 0);
566 
567 	/*
568 	 * When paravirtualized, we must use hypervisor calls to modify the
569 	 * PTE, since paging is active. On real hardware we just write to
570 	 * the pagetables which aren't in use yet.
571 	 */
572 #if defined(__xpv)
573 	ptep = ptep;	/* shut lint up */
574 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
575 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
576 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
577 		    (uint64_t)va, level, (uint64_t)ma, pteval);
578 #else
579 	if (va < 1024 * 1024)
580 		pteval |= PT_NOCACHE;		/* for video RAM */
581 	if (pae_support)
582 		*ptep = pteval;
583 	else
584 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
585 #endif
586 }
587 
588 /*
589  * Add a mapping for the physical page at the given virtual address.
590  */
591 static void
592 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
593 {
594 	map_ma_at_va(pa_to_ma(pa), va, level);
595 }
596 
597 /*
598  * This is called to remove start..end from the
599  * possible range of PCI addresses.
600  */
601 const uint64_t pci_lo_limit = 0x00100000ul;
602 const uint64_t pci_hi_limit = 0xfff00000ul;
603 static void
604 exclude_from_pci(uint64_t start, uint64_t end)
605 {
606 	int i;
607 	int j;
608 	struct boot_memlist *ml;
609 
610 	for (i = 0; i < pcimemlists_used; ++i) {
611 		ml = &pcimemlists[i];
612 
613 		/* delete the entire range? */
614 		if (start <= ml->addr && ml->addr + ml->size <= end) {
615 			--pcimemlists_used;
616 			for (j = i; j < pcimemlists_used; ++j)
617 				pcimemlists[j] = pcimemlists[j + 1];
618 			--i;	/* to revisit the new one at this index */
619 		}
620 
621 		/* split a range? */
622 		else if (ml->addr < start && end < ml->addr + ml->size) {
623 
624 			++pcimemlists_used;
625 			if (pcimemlists_used > MAX_MEMLIST)
626 				dboot_panic("too many pcimemlists");
627 
628 			for (j = pcimemlists_used - 1; j > i; --j)
629 				pcimemlists[j] = pcimemlists[j - 1];
630 			ml->size = start - ml->addr;
631 
632 			++ml;
633 			ml->size = (ml->addr + ml->size) - end;
634 			ml->addr = end;
635 			++i;	/* skip on to next one */
636 		}
637 
638 		/* cut memory off the start? */
639 		else if (ml->addr < end && end < ml->addr + ml->size) {
640 			ml->size -= end - ml->addr;
641 			ml->addr = end;
642 		}
643 
644 		/* cut memory off the end? */
645 		else if (ml->addr <= start && start < ml->addr + ml->size) {
646 			ml->size = start - ml->addr;
647 		}
648 	}
649 }
650 
651 /*
652  * During memory allocation, find the highest address not used yet.
653  */
654 static void
655 check_higher(paddr_t a)
656 {
657 	if (a < next_avail_addr)
658 		return;
659 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
660 	DBG(next_avail_addr);
661 }
662 
663 static int
664 dboot_loader_mmap_entries(void)
665 {
666 #if !defined(__xpv)
667 	if (num_entries_set == B_TRUE)
668 		return (num_entries);
669 
670 	switch (multiboot_version) {
671 	case 1:
672 		DBG(mb_info->flags);
673 		if (mb_info->flags & 0x40) {
674 			mb_memory_map_t *mmap;
675 
676 			DBG(mb_info->mmap_addr);
677 			DBG(mb_info->mmap_length);
678 			check_higher(mb_info->mmap_addr + mb_info->mmap_length);
679 
680 			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
681 			    (uint32_t)mmap < mb_info->mmap_addr +
682 			    mb_info->mmap_length;
683 			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
684 			    mmap->size + sizeof (mmap->size)))
685 				++num_entries;
686 
687 			num_entries_set = B_TRUE;
688 		}
689 		break;
690 	case 2:
691 		num_entries_set = B_TRUE;
692 		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
693 		    mb2_mmap_tagp);
694 		break;
695 	default:
696 		dboot_panic("Unknown multiboot version: %d\n",
697 		    multiboot_version);
698 		break;
699 	}
700 	return (num_entries);
701 #else
702 	return (MAXMAPS);
703 #endif
704 }
705 
706 static uint32_t
707 dboot_loader_mmap_get_type(int index)
708 {
709 #if !defined(__xpv)
710 	mb_memory_map_t *mp, *mpend;
711 	int i;
712 
713 	switch (multiboot_version) {
714 	case 1:
715 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
716 		mpend = (mb_memory_map_t *)
717 		    (mb_info->mmap_addr + mb_info->mmap_length);
718 
719 		for (i = 0; mp < mpend && i != index; i++)
720 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
721 			    sizeof (mp->size));
722 		if (mp >= mpend) {
723 			dboot_panic("dboot_loader_mmap_get_type(): index "
724 			    "out of bounds: %d\n", index);
725 		}
726 		return (mp->type);
727 
728 	case 2:
729 		return (dboot_multiboot2_mmap_get_type(mb2_info,
730 		    mb2_mmap_tagp, index));
731 
732 	default:
733 		dboot_panic("Unknown multiboot version: %d\n",
734 		    multiboot_version);
735 		break;
736 	}
737 	return (0);
738 #else
739 	return (map_buffer[index].type);
740 #endif
741 }
742 
743 static uint64_t
744 dboot_loader_mmap_get_base(int index)
745 {
746 #if !defined(__xpv)
747 	mb_memory_map_t *mp, *mpend;
748 	int i;
749 
750 	switch (multiboot_version) {
751 	case 1:
752 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
753 		mpend = (mb_memory_map_t *)
754 		    (mb_info->mmap_addr + mb_info->mmap_length);
755 
756 		for (i = 0; mp < mpend && i != index; i++)
757 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
758 			    sizeof (mp->size));
759 		if (mp >= mpend) {
760 			dboot_panic("dboot_loader_mmap_get_base(): index "
761 			    "out of bounds: %d\n", index);
762 		}
763 		return (((uint64_t)mp->base_addr_high << 32) +
764 		    (uint64_t)mp->base_addr_low);
765 
766 	case 2:
767 		return (dboot_multiboot2_mmap_get_base(mb2_info,
768 		    mb2_mmap_tagp, index));
769 
770 	default:
771 		dboot_panic("Unknown multiboot version: %d\n",
772 		    multiboot_version);
773 		break;
774 	}
775 	return (0);
776 #else
777 	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
778 	    (uint64_t)map_buffer[index].base_addr_low);
779 #endif
780 }
781 
782 static uint64_t
783 dboot_loader_mmap_get_length(int index)
784 {
785 #if !defined(__xpv)
786 	mb_memory_map_t *mp, *mpend;
787 	int i;
788 
789 	switch (multiboot_version) {
790 	case 1:
791 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
792 		mpend = (mb_memory_map_t *)
793 		    (mb_info->mmap_addr + mb_info->mmap_length);
794 
795 		for (i = 0; mp < mpend && i != index; i++)
796 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
797 			    sizeof (mp->size));
798 		if (mp >= mpend) {
799 			dboot_panic("dboot_loader_mmap_get_length(): index "
800 			    "out of bounds: %d\n", index);
801 		}
802 		return (((uint64_t)mp->length_high << 32) +
803 		    (uint64_t)mp->length_low);
804 
805 	case 2:
806 		return (dboot_multiboot2_mmap_get_length(mb2_info,
807 		    mb2_mmap_tagp, index));
808 
809 	default:
810 		dboot_panic("Unknown multiboot version: %d\n",
811 		    multiboot_version);
812 		break;
813 	}
814 	return (0);
815 #else
816 	return (((uint64_t)map_buffer[index].length_high << 32) +
817 	    (uint64_t)map_buffer[index].length_low);
818 #endif
819 }
820 
821 static void
822 build_pcimemlists(void)
823 {
824 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
825 	uint64_t start;
826 	uint64_t end;
827 	int i, num;
828 
829 	/*
830 	 * initialize
831 	 */
832 	pcimemlists[0].addr = pci_lo_limit;
833 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
834 	pcimemlists_used = 1;
835 
836 	num = dboot_loader_mmap_entries();
837 	/*
838 	 * Fill in PCI memlists.
839 	 */
840 	for (i = 0; i < num; ++i) {
841 		start = dboot_loader_mmap_get_base(i);
842 		end = start + dboot_loader_mmap_get_length(i);
843 
844 		if (prom_debug)
845 			dboot_printf("\ttype: %d %" PRIx64 "..%"
846 			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
847 			    start, end);
848 
849 		/*
850 		 * page align start and end
851 		 */
852 		start = (start + page_offset) & ~page_offset;
853 		end &= ~page_offset;
854 		if (end <= start)
855 			continue;
856 
857 		exclude_from_pci(start, end);
858 	}
859 
860 	/*
861 	 * Finish off the pcimemlist
862 	 */
863 	if (prom_debug) {
864 		for (i = 0; i < pcimemlists_used; ++i) {
865 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
866 			    PRIx64 "\n", pcimemlists[i].addr,
867 			    pcimemlists[i].addr + pcimemlists[i].size);
868 		}
869 	}
870 	pcimemlists[0].next = 0;
871 	pcimemlists[0].prev = 0;
872 	for (i = 1; i < pcimemlists_used; ++i) {
873 		pcimemlists[i].prev =
874 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
875 		pcimemlists[i].next = 0;
876 		pcimemlists[i - 1].next =
877 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
878 	}
879 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
880 	DBG(bi->bi_pcimem);
881 }
882 
883 #if defined(__xpv)
884 /*
885  * Initialize memory allocator stuff from hypervisor-supplied start info.
886  */
887 static void
888 init_mem_alloc(void)
889 {
890 	int	local;	/* variables needed to find start region */
891 	paddr_t	scratch_start;
892 	xen_memory_map_t map;
893 
894 	DBG_MSG("Entered init_mem_alloc()\n");
895 
896 	/*
897 	 * Free memory follows the stack. There's at least 512KB of scratch
898 	 * space, rounded up to at least 2Mb alignment.  That should be enough
899 	 * for the page tables we'll need to build.  The nucleus memory is
900 	 * allocated last and will be outside the addressible range.  We'll
901 	 * switch to new page tables before we unpack the kernel
902 	 */
903 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
904 	DBG(scratch_start);
905 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
906 	DBG(scratch_end);
907 
908 	/*
909 	 * For paranoia, leave some space between hypervisor data and ours.
910 	 * Use 500 instead of 512.
911 	 */
912 	next_avail_addr = scratch_end - 500 * 1024;
913 	DBG(next_avail_addr);
914 
915 	/*
916 	 * The domain builder gives us at most 1 module
917 	 */
918 	DBG(xen_info->mod_len);
919 	if (xen_info->mod_len > 0) {
920 		DBG(xen_info->mod_start);
921 		modules[0].bm_addr =
922 		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
923 		modules[0].bm_size = xen_info->mod_len;
924 		bi->bi_module_cnt = 1;
925 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
926 	} else {
927 		bi->bi_module_cnt = 0;
928 		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
929 	}
930 	DBG(bi->bi_module_cnt);
931 	DBG(bi->bi_modules);
932 
933 	DBG(xen_info->mfn_list);
934 	DBG(xen_info->nr_pages);
935 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
936 	DBG(max_mem);
937 
938 	/*
939 	 * Using pseudo-physical addresses, so only 1 memlist element
940 	 */
941 	memlists[0].addr = 0;
942 	DBG(memlists[0].addr);
943 	memlists[0].size = max_mem;
944 	DBG(memlists[0].size);
945 	memlists_used = 1;
946 	DBG(memlists_used);
947 
948 	/*
949 	 * finish building physinstall list
950 	 */
951 	sort_physinstall();
952 
953 	/*
954 	 * build bios reserved memlists
955 	 */
956 	build_rsvdmemlists();
957 
958 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
959 		/*
960 		 * build PCI Memory list
961 		 */
962 		map.nr_entries = MAXMAPS;
963 		/*LINTED: constant in conditional context*/
964 		set_xen_guest_handle(map.buffer, map_buffer);
965 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
966 			dboot_panic("getting XENMEM_machine_memory_map failed");
967 		build_pcimemlists();
968 	}
969 }
970 
971 #else	/* !__xpv */
972 
973 static void
974 dboot_multiboot1_xboot_consinfo(void)
975 {
976 	bi->bi_framebuffer = NULL;
977 }
978 
979 static void
980 dboot_multiboot2_xboot_consinfo(void)
981 {
982 	multiboot_tag_framebuffer_t *fb;
983 	fb = dboot_multiboot2_find_tag(mb2_info,
984 	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
985 	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
986 }
987 
988 static int
989 dboot_multiboot_modcount(void)
990 {
991 	switch (multiboot_version) {
992 	case 1:
993 		return (mb_info->mods_count);
994 
995 	case 2:
996 		return (dboot_multiboot2_modcount(mb2_info));
997 
998 	default:
999 		dboot_panic("Unknown multiboot version: %d\n",
1000 		    multiboot_version);
1001 		break;
1002 	}
1003 	return (0);
1004 }
1005 
1006 static uint32_t
1007 dboot_multiboot_modstart(int index)
1008 {
1009 	switch (multiboot_version) {
1010 	case 1:
1011 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1012 
1013 	case 2:
1014 		return (dboot_multiboot2_modstart(mb2_info, index));
1015 
1016 	default:
1017 		dboot_panic("Unknown multiboot version: %d\n",
1018 		    multiboot_version);
1019 		break;
1020 	}
1021 	return (0);
1022 }
1023 
1024 static uint32_t
1025 dboot_multiboot_modend(int index)
1026 {
1027 	switch (multiboot_version) {
1028 	case 1:
1029 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1030 
1031 	case 2:
1032 		return (dboot_multiboot2_modend(mb2_info, index));
1033 
1034 	default:
1035 		dboot_panic("Unknown multiboot version: %d\n",
1036 		    multiboot_version);
1037 		break;
1038 	}
1039 	return (0);
1040 }
1041 
1042 static char *
1043 dboot_multiboot_modcmdline(int index)
1044 {
1045 	switch (multiboot_version) {
1046 	case 1:
1047 		return ((char *)((mb_module_t *)
1048 		    mb_info->mods_addr)[index].mod_name);
1049 
1050 	case 2:
1051 		return (dboot_multiboot2_modcmdline(mb2_info, index));
1052 
1053 	default:
1054 		dboot_panic("Unknown multiboot version: %d\n",
1055 		    multiboot_version);
1056 		break;
1057 	}
1058 	return (0);
1059 }
1060 
1061 /*
1062  * Find the environment module for console setup.
1063  * Since we need the console to print early boot messages, the console is set up
1064  * before anything else and therefore we need to pick up the environment module
1065  * early too.
1066  *
1067  * Note, we just will search for and if found, will pass the env
1068  * module to console setup, the proper module list processing will happen later.
1069  */
1070 static void
1071 dboot_find_env(void)
1072 {
1073 	int i, modcount;
1074 	uint32_t mod_start, mod_end;
1075 	char *cmdline;
1076 
1077 	modcount = dboot_multiboot_modcount();
1078 
1079 	for (i = 0; i < modcount; ++i) {
1080 		cmdline = dboot_multiboot_modcmdline(i);
1081 		if (cmdline == NULL)
1082 			continue;
1083 
1084 		if (strstr(cmdline, "type=environment") == NULL)
1085 			continue;
1086 
1087 		mod_start = dboot_multiboot_modstart(i);
1088 		mod_end = dboot_multiboot_modend(i);
1089 		modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1090 		modules[0].bm_size = mod_end - mod_start;
1091 		modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
1092 		modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1093 		modules[0].bm_type = BMT_ENV;
1094 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1095 		bi->bi_module_cnt = 1;
1096 		return;
1097 	}
1098 }
1099 
1100 static boolean_t
1101 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1102 {
1103 	boolean_t rv = B_FALSE;
1104 
1105 	switch (multiboot_version) {
1106 	case 1:
1107 		if (mb_info->flags & 0x01) {
1108 			*lower = mb_info->mem_lower;
1109 			*upper = mb_info->mem_upper;
1110 			rv = B_TRUE;
1111 		}
1112 		break;
1113 
1114 	case 2:
1115 		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1116 
1117 	default:
1118 		dboot_panic("Unknown multiboot version: %d\n",
1119 		    multiboot_version);
1120 		break;
1121 	}
1122 	return (rv);
1123 }
1124 
1125 static uint8_t
1126 dboot_a2h(char v)
1127 {
1128 	if (v >= 'a')
1129 		return (v - 'a' + 0xa);
1130 	else if (v >= 'A')
1131 		return (v - 'A' + 0xa);
1132 	else if (v >= '0')
1133 		return (v - '0');
1134 	else
1135 		dboot_panic("bad ASCII hex character %c\n", v);
1136 
1137 	return (0);
1138 }
1139 
1140 static void
1141 digest_a2h(const char *ascii, uint8_t *digest)
1142 {
1143 	unsigned int i;
1144 
1145 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1146 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1147 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1148 	}
1149 }
1150 
1151 /*
1152  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1153  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1154  * match, return 0, otherwise -1.  This works only for images smaller than
1155  * 4 GB, which should not be a problem.
1156  */
1157 static int
1158 check_image_hash(uint_t midx)
1159 {
1160 	const char *ascii;
1161 	const void *image;
1162 	size_t len;
1163 	SHA1_CTX ctx;
1164 	uint8_t digest[SHA1_DIGEST_LENGTH];
1165 	uint8_t baseline[SHA1_DIGEST_LENGTH];
1166 	unsigned int i;
1167 
1168 	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1169 	image = (const void *)(uintptr_t)modules[midx].bm_addr;
1170 	len = (size_t)modules[midx].bm_size;
1171 
1172 	digest_a2h(ascii, baseline);
1173 
1174 	SHA1Init(&ctx);
1175 	SHA1Update(&ctx, image, len);
1176 	SHA1Final(digest, &ctx);
1177 
1178 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1179 		if (digest[i] != baseline[i])
1180 			return (-1);
1181 	}
1182 
1183 	return (0);
1184 }
1185 
1186 static const char *
1187 type_to_str(boot_module_type_t type)
1188 {
1189 	switch (type) {
1190 	case BMT_ROOTFS:
1191 		return ("rootfs");
1192 	case BMT_FILE:
1193 		return ("file");
1194 	case BMT_HASH:
1195 		return ("hash");
1196 	case BMT_ENV:
1197 		return ("environment");
1198 	default:
1199 		return ("unknown");
1200 	}
1201 }
1202 
1203 static void
1204 check_images(void)
1205 {
1206 	uint_t i;
1207 	char displayhash[SHA1_ASCII_LENGTH + 1];
1208 
1209 	for (i = 0; i < modules_used; i++) {
1210 		if (prom_debug) {
1211 			dboot_printf("module #%d: name %s type %s "
1212 			    "addr %lx size %lx\n",
1213 			    i, (char *)(uintptr_t)modules[i].bm_name,
1214 			    type_to_str(modules[i].bm_type),
1215 			    (ulong_t)modules[i].bm_addr,
1216 			    (ulong_t)modules[i].bm_size);
1217 		}
1218 
1219 		if (modules[i].bm_type == BMT_HASH ||
1220 		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1221 			DBG_MSG("module has no hash; skipping check\n");
1222 			continue;
1223 		}
1224 		(void) memcpy(displayhash,
1225 		    (void *)(uintptr_t)modules[i].bm_hash,
1226 		    SHA1_ASCII_LENGTH);
1227 		displayhash[SHA1_ASCII_LENGTH] = '\0';
1228 		if (prom_debug) {
1229 			dboot_printf("checking expected hash [%s]: ",
1230 			    displayhash);
1231 		}
1232 
1233 		if (check_image_hash(i) != 0)
1234 			dboot_panic("hash mismatch!\n");
1235 		else
1236 			DBG_MSG("OK\n");
1237 	}
1238 }
1239 
1240 /*
1241  * Determine the module's starting address, size, name, and type, and fill the
1242  * boot_modules structure.  This structure is used by the bop code, except for
1243  * hashes which are checked prior to transferring control to the kernel.
1244  */
1245 static void
1246 process_module(int midx)
1247 {
1248 	uint32_t mod_start = dboot_multiboot_modstart(midx);
1249 	uint32_t mod_end = dboot_multiboot_modend(midx);
1250 	char *cmdline = dboot_multiboot_modcmdline(midx);
1251 	char *p, *q;
1252 
1253 	check_higher(mod_end);
1254 	if (prom_debug) {
1255 		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1256 		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1257 	}
1258 
1259 	if (mod_start > mod_end) {
1260 		dboot_panic("module #%d: module start address 0x%lx greater "
1261 		    "than end address 0x%lx", midx,
1262 		    (ulong_t)mod_start, (ulong_t)mod_end);
1263 	}
1264 
1265 	/*
1266 	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1267 	 * the address of the last valid byte in a module plus 1 as mod_end.
1268 	 * This is of course a bug; the multiboot specification simply states
1269 	 * that mod_start and mod_end "contain the start and end addresses of
1270 	 * the boot module itself" which is pretty obviously not what GRUB is
1271 	 * doing.  However, fixing it requires that not only this code be
1272 	 * changed but also that other code consuming this value and values
1273 	 * derived from it be fixed, and that the kernel and GRUB must either
1274 	 * both have the bug or neither.  While there are a lot of combinations
1275 	 * that will work, there are also some that won't, so for simplicity
1276 	 * we'll just cope with the bug.  That means we won't actually hash the
1277 	 * byte at mod_end, and we will expect that mod_end for the hash file
1278 	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1279 	 * hash plus a newline for each module).  We set bm_size to the true
1280 	 * correct number of bytes in each module, achieving exactly this.
1281 	 */
1282 
1283 	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1284 	modules[midx].bm_size = mod_end - mod_start;
1285 	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1286 	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1287 	modules[midx].bm_type = BMT_FILE;
1288 
1289 	if (cmdline == NULL) {
1290 		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1291 		return;
1292 	}
1293 
1294 	p = cmdline;
1295 	modules[midx].bm_name =
1296 	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1297 
1298 	while (p != NULL) {
1299 		q = strsep(&p, " \t\f\n\r");
1300 		if (strncmp(q, "name=", 5) == 0) {
1301 			if (q[5] != '\0' && !isspace(q[5])) {
1302 				modules[midx].bm_name =
1303 				    (native_ptr_t)(uintptr_t)(q + 5);
1304 			}
1305 			continue;
1306 		}
1307 
1308 		if (strncmp(q, "type=", 5) == 0) {
1309 			if (q[5] == '\0' || isspace(q[5]))
1310 				continue;
1311 			q += 5;
1312 			if (strcmp(q, "rootfs") == 0) {
1313 				modules[midx].bm_type = BMT_ROOTFS;
1314 			} else if (strcmp(q, "hash") == 0) {
1315 				modules[midx].bm_type = BMT_HASH;
1316 			} else if (strcmp(q, "environment") == 0) {
1317 				modules[midx].bm_type = BMT_ENV;
1318 			} else if (strcmp(q, "file") != 0) {
1319 				dboot_printf("\tmodule #%d: unknown module "
1320 				    "type '%s'; defaulting to 'file'",
1321 				    midx, q);
1322 			}
1323 			continue;
1324 		}
1325 
1326 		if (strncmp(q, "hash=", 5) == 0) {
1327 			if (q[5] != '\0' && !isspace(q[5])) {
1328 				modules[midx].bm_hash =
1329 				    (native_ptr_t)(uintptr_t)(q + 5);
1330 			}
1331 			continue;
1332 		}
1333 
1334 		dboot_printf("ignoring unknown option '%s'\n", q);
1335 	}
1336 }
1337 
1338 /*
1339  * Backward compatibility: if there are exactly one or two modules, both
1340  * of type 'file' and neither with an embedded hash value, we have been
1341  * given the legacy style modules.  In this case we need to treat the first
1342  * module as a rootfs and the second as a hash referencing that module.
1343  * Otherwise, even if the configuration is invalid, we assume that the
1344  * operator knows what he's doing or at least isn't being bitten by this
1345  * interface change.
1346  */
1347 static void
1348 fixup_modules(void)
1349 {
1350 	if (modules_used == 0 || modules_used > 2)
1351 		return;
1352 
1353 	if (modules[0].bm_type != BMT_FILE ||
1354 	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1355 		return;
1356 	}
1357 
1358 	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1359 	    modules_used > 1 &&
1360 	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1361 		return;
1362 	}
1363 
1364 	modules[0].bm_type = BMT_ROOTFS;
1365 	if (modules_used > 1) {
1366 		modules[1].bm_type = BMT_HASH;
1367 		modules[1].bm_name = modules[0].bm_name;
1368 	}
1369 }
1370 
1371 /*
1372  * For modules that do not have assigned hashes but have a separate hash module,
1373  * find the assigned hash module and set the primary module's bm_hash to point
1374  * to the hash data from that module.  We will then ignore modules of type
1375  * BMT_HASH from this point forward.
1376  */
1377 static void
1378 assign_module_hashes(void)
1379 {
1380 	uint_t i, j;
1381 
1382 	for (i = 0; i < modules_used; i++) {
1383 		if (modules[i].bm_type == BMT_HASH ||
1384 		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1385 			continue;
1386 		}
1387 
1388 		for (j = 0; j < modules_used; j++) {
1389 			if (modules[j].bm_type != BMT_HASH ||
1390 			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1391 			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1392 				continue;
1393 			}
1394 
1395 			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1396 				dboot_printf("Short hash module of length "
1397 				    "0x%lx bytes; ignoring\n",
1398 				    (ulong_t)modules[j].bm_size);
1399 			} else {
1400 				modules[i].bm_hash = modules[j].bm_addr;
1401 			}
1402 			break;
1403 		}
1404 	}
1405 }
1406 
1407 /*
1408  * Walk through the module information finding the last used address.
1409  * The first available address will become the top level page table.
1410  */
1411 static void
1412 dboot_process_modules(void)
1413 {
1414 	int i, modcount;
1415 	extern char _end[];
1416 
1417 	DBG_MSG("\nFinding Modules\n");
1418 	modcount = dboot_multiboot_modcount();
1419 	if (modcount > MAX_BOOT_MODULES) {
1420 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1421 		    modcount, MAX_BOOT_MODULES);
1422 	}
1423 	/*
1424 	 * search the modules to find the last used address
1425 	 * we'll build the module list while we're walking through here
1426 	 */
1427 	check_higher((paddr_t)(uintptr_t)&_end);
1428 	for (i = 0; i < modcount; ++i) {
1429 		process_module(i);
1430 		modules_used++;
1431 	}
1432 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1433 	DBG(bi->bi_modules);
1434 	bi->bi_module_cnt = modcount;
1435 	DBG(bi->bi_module_cnt);
1436 
1437 	fixup_modules();
1438 	assign_module_hashes();
1439 	check_images();
1440 }
1441 
1442 /*
1443  * We then build the phys_install memlist from the multiboot information.
1444  */
1445 static void
1446 dboot_process_mmap(void)
1447 {
1448 	uint64_t start;
1449 	uint64_t end;
1450 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
1451 	uint32_t lower, upper;
1452 	int i, mmap_entries;
1453 
1454 	/*
1455 	 * Walk through the memory map from multiboot and build our memlist
1456 	 * structures. Note these will have native format pointers.
1457 	 */
1458 	DBG_MSG("\nFinding Memory Map\n");
1459 	num_entries = 0;
1460 	num_entries_set = B_FALSE;
1461 	max_mem = 0;
1462 	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1463 		for (i = 0; i < mmap_entries; i++) {
1464 			uint32_t type = dboot_loader_mmap_get_type(i);
1465 			start = dboot_loader_mmap_get_base(i);
1466 			end = start + dboot_loader_mmap_get_length(i);
1467 
1468 			if (prom_debug)
1469 				dboot_printf("\ttype: %d %" PRIx64 "..%"
1470 				    PRIx64 "\n", type, start, end);
1471 
1472 			/*
1473 			 * page align start and end
1474 			 */
1475 			start = (start + page_offset) & ~page_offset;
1476 			end &= ~page_offset;
1477 			if (end <= start)
1478 				continue;
1479 
1480 			/*
1481 			 * only type 1 is usable RAM
1482 			 */
1483 			switch (type) {
1484 			case 1:
1485 				if (end > max_mem)
1486 					max_mem = end;
1487 				memlists[memlists_used].addr = start;
1488 				memlists[memlists_used].size = end - start;
1489 				++memlists_used;
1490 				if (memlists_used > MAX_MEMLIST)
1491 					dboot_panic("too many memlists");
1492 				break;
1493 			case 2:
1494 				rsvdmemlists[rsvdmemlists_used].addr = start;
1495 				rsvdmemlists[rsvdmemlists_used].size =
1496 				    end - start;
1497 				++rsvdmemlists_used;
1498 				if (rsvdmemlists_used > MAX_MEMLIST)
1499 					dboot_panic("too many rsvdmemlists");
1500 				break;
1501 			default:
1502 				continue;
1503 			}
1504 		}
1505 		build_pcimemlists();
1506 	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1507 		DBG(lower);
1508 		memlists[memlists_used].addr = 0;
1509 		memlists[memlists_used].size = lower * 1024;
1510 		++memlists_used;
1511 		DBG(upper);
1512 		memlists[memlists_used].addr = 1024 * 1024;
1513 		memlists[memlists_used].size = upper * 1024;
1514 		++memlists_used;
1515 
1516 		/*
1517 		 * Old platform - assume I/O space at the end of memory.
1518 		 */
1519 		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1520 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1521 		pcimemlists[0].next = 0;
1522 		pcimemlists[0].prev = 0;
1523 		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1524 		DBG(bi->bi_pcimem);
1525 	} else {
1526 		dboot_panic("No memory info from boot loader!!!");
1527 	}
1528 
1529 	/*
1530 	 * finish processing the physinstall list
1531 	 */
1532 	sort_physinstall();
1533 
1534 	/*
1535 	 * build bios reserved mem lists
1536 	 */
1537 	build_rsvdmemlists();
1538 }
1539 
1540 /*
1541  * The highest address is used as the starting point for dboot's simple
1542  * memory allocator.
1543  *
1544  * Finding the highest address in case of Multiboot 1 protocol is
1545  * quite painful in the sense that some information provided by
1546  * the multiboot info structure points to BIOS data, and some to RAM.
1547  *
1548  * The module list was processed and checked already by dboot_process_modules(),
1549  * so we will check the command line string and the memory map.
1550  *
1551  * This list of to be checked items is based on our current knowledge of
1552  * allocations made by grub1 and will need to be reviewed if there
1553  * are updates about the information provided by Multiboot 1.
1554  *
1555  * In the case of the Multiboot 2, our life is much simpler, as the MB2
1556  * information tag list is one contiguous chunk of memory.
1557  */
1558 static paddr_t
1559 dboot_multiboot1_highest_addr(void)
1560 {
1561 	paddr_t addr = (paddr_t)(uintptr_t)NULL;
1562 	char *cmdl = (char *)mb_info->cmdline;
1563 
1564 	if (mb_info->flags & MB_INFO_CMDLINE)
1565 		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1566 
1567 	if (mb_info->flags & MB_INFO_MEM_MAP)
1568 		addr = MAX(addr,
1569 		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1570 	return (addr);
1571 }
1572 
1573 static void
1574 dboot_multiboot_highest_addr(void)
1575 {
1576 	paddr_t addr;
1577 
1578 	switch (multiboot_version) {
1579 	case 1:
1580 		addr = dboot_multiboot1_highest_addr();
1581 		if (addr != (paddr_t)(uintptr_t)NULL)
1582 			check_higher(addr);
1583 		break;
1584 	case 2:
1585 		addr = dboot_multiboot2_highest_addr(mb2_info);
1586 		if (addr != (paddr_t)(uintptr_t)NULL)
1587 			check_higher(addr);
1588 		break;
1589 	default:
1590 		dboot_panic("Unknown multiboot version: %d\n",
1591 		    multiboot_version);
1592 		break;
1593 	}
1594 }
1595 
/*
 * Prepare the simple memory allocator by walking the information provided by
 * the boot loader and finding the highest free address.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	/* Modules first, then the memory map. */
	dboot_process_modules();
	dboot_process_mmap();

	/* Finally account for the boot loader's own data. */
	dboot_multiboot_highest_addr();
}
1607 
1608 static int
1609 dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1610 {
1611 	int i;
1612 
1613 	if (g1->time_low != g2->time_low)
1614 		return (0);
1615 	if (g1->time_mid != g2->time_mid)
1616 		return (0);
1617 	if (g1->time_hi_and_version != g2->time_hi_and_version)
1618 		return (0);
1619 	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1620 		return (0);
1621 	if (g1->clock_seq_low != g2->clock_seq_low)
1622 		return (0);
1623 
1624 	for (i = 0; i < 6; i++) {
1625 		if (g1->node_addr[i] != g2->node_addr[i])
1626 			return (0);
1627 	}
1628 	return (1);
1629 }
1630 
1631 static void
1632 process_efi32(EFI_SYSTEM_TABLE32 *efi)
1633 {
1634 	uint32_t entries;
1635 	EFI_CONFIGURATION_TABLE32 *config;
1636 	int i;
1637 
1638 	entries = efi->NumberOfTableEntries;
1639 	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1640 	    efi->ConfigurationTable;
1641 
1642 	for (i = 0; i < entries; i++) {
1643 		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1644 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1645 			    config[i].VendorTable;
1646 		}
1647 		if (bi->bi_smbios == NULL &&
1648 		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1649 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1650 			    config[i].VendorTable;
1651 		}
1652 		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1653 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1654 			    config[i].VendorTable;
1655 		}
1656 		if (bi->bi_acpi_rsdp == NULL &&
1657 		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1658 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1659 			    config[i].VendorTable;
1660 		}
1661 	}
1662 }
1663 
1664 static void
1665 process_efi64(EFI_SYSTEM_TABLE64 *efi)
1666 {
1667 	uint64_t entries;
1668 	EFI_CONFIGURATION_TABLE64 *config;
1669 	int i;
1670 
1671 	entries = efi->NumberOfTableEntries;
1672 	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1673 	    efi->ConfigurationTable;
1674 
1675 	for (i = 0; i < entries; i++) {
1676 		if (dboot_same_guids(&config[i].VendorGuid, &smbios3)) {
1677 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1678 			    config[i].VendorTable;
1679 		}
1680 		if (bi->bi_smbios == NULL &&
1681 		    dboot_same_guids(&config[i].VendorGuid, &smbios)) {
1682 			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1683 			    config[i].VendorTable;
1684 		}
1685 		/* Prefer acpi v2+ over v1. */
1686 		if (dboot_same_guids(&config[i].VendorGuid, &acpi2)) {
1687 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1688 			    config[i].VendorTable;
1689 		}
1690 		if (bi->bi_acpi_rsdp == NULL &&
1691 		    dboot_same_guids(&config[i].VendorGuid, &acpi1)) {
1692 			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1693 			    config[i].VendorTable;
1694 		}
1695 	}
1696 }
1697 
1698 static void
1699 dboot_multiboot_get_fwtables(void)
1700 {
1701 	multiboot_tag_new_acpi_t *nacpitagp;
1702 	multiboot_tag_old_acpi_t *oacpitagp;
1703 	multiboot_tag_efi64_t *efi64tagp = NULL;
1704 	multiboot_tag_efi32_t *efi32tagp = NULL;
1705 
1706 	/* no fw tables from multiboot 1 */
1707 	if (multiboot_version != 2)
1708 		return;
1709 
1710 	efi64tagp = (multiboot_tag_efi64_t *)
1711 	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1712 	if (efi64tagp != NULL) {
1713 		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1714 		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1715 		    efi64tagp->mb_pointer;
1716 		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1717 		    efi64tagp->mb_pointer);
1718 	} else {
1719 		efi32tagp = (multiboot_tag_efi32_t *)
1720 		    dboot_multiboot2_find_tag(mb2_info,
1721 		    MULTIBOOT_TAG_TYPE_EFI32);
1722 		if (efi32tagp != NULL) {
1723 			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1724 			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1725 			    efi32tagp->mb_pointer;
1726 			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1727 			    efi32tagp->mb_pointer);
1728 		}
1729 	}
1730 
1731 	/*
1732 	 * The ACPI RSDP can be found by scanning the BIOS memory areas or
1733 	 * from the EFI system table. The boot loader may pass in the address
1734 	 * it found the ACPI tables at.
1735 	 */
1736 	nacpitagp = (multiboot_tag_new_acpi_t *)
1737 	    dboot_multiboot2_find_tag(mb2_info,
1738 	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
1739 	oacpitagp = (multiboot_tag_old_acpi_t *)
1740 	    dboot_multiboot2_find_tag(mb2_info,
1741 	    MULTIBOOT_TAG_TYPE_ACPI_OLD);
1742 
1743 	if (nacpitagp != NULL) {
1744 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1745 		    &nacpitagp->mb_rsdp[0];
1746 	} else if (oacpitagp != NULL) {
1747 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1748 		    &oacpitagp->mb_rsdp[0];
1749 	}
1750 }
1751 
1752 /* print out EFI version string with newline */
1753 static void
1754 dboot_print_efi_version(uint32_t ver)
1755 {
1756 	int rev;
1757 
1758 	dboot_printf("%d.", EFI_REV_MAJOR(ver));
1759 
1760 	rev = EFI_REV_MINOR(ver);
1761 	if ((rev % 10) != 0) {
1762 		dboot_printf("%d.%d\n", rev / 10, rev % 10);
1763 	} else {
1764 		dboot_printf("%d\n", rev / 10);
1765 	}
1766 }
1767 
1768 static void
1769 print_efi32(EFI_SYSTEM_TABLE32 *efi)
1770 {
1771 	uint16_t *data;
1772 	EFI_CONFIGURATION_TABLE32 *conf;
1773 	int i;
1774 
1775 	dboot_printf("EFI32 signature: %llx\n",
1776 	    (unsigned long long)efi->Hdr.Signature);
1777 	dboot_printf("EFI system version: ");
1778 	dboot_print_efi_version(efi->Hdr.Revision);
1779 	dboot_printf("EFI system vendor: ");
1780 	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1781 	for (i = 0; data[i] != 0; i++)
1782 		dboot_printf("%c", (char)data[i]);
1783 	dboot_printf("\nEFI firmware revision: ");
1784 	dboot_print_efi_version(efi->FirmwareRevision);
1785 	dboot_printf("EFI system table number of entries: %d\n",
1786 	    efi->NumberOfTableEntries);
1787 	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1788 	    efi->ConfigurationTable;
1789 	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1790 		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1791 		    conf[i].VendorGuid.time_low,
1792 		    conf[i].VendorGuid.time_mid,
1793 		    conf[i].VendorGuid.time_hi_and_version,
1794 		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1795 		    conf[i].VendorGuid.clock_seq_low);
1796 		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1797 		    conf[i].VendorGuid.node_addr[0],
1798 		    conf[i].VendorGuid.node_addr[1],
1799 		    conf[i].VendorGuid.node_addr[2],
1800 		    conf[i].VendorGuid.node_addr[3],
1801 		    conf[i].VendorGuid.node_addr[4],
1802 		    conf[i].VendorGuid.node_addr[5]);
1803 	}
1804 }
1805 
1806 static void
1807 print_efi64(EFI_SYSTEM_TABLE64 *efi)
1808 {
1809 	uint16_t *data;
1810 	EFI_CONFIGURATION_TABLE64 *conf;
1811 	int i;
1812 
1813 	dboot_printf("EFI64 signature: %llx\n",
1814 	    (unsigned long long)efi->Hdr.Signature);
1815 	dboot_printf("EFI system version: ");
1816 	dboot_print_efi_version(efi->Hdr.Revision);
1817 	dboot_printf("EFI system vendor: ");
1818 	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1819 	for (i = 0; data[i] != 0; i++)
1820 		dboot_printf("%c", (char)data[i]);
1821 	dboot_printf("\nEFI firmware revision: ");
1822 	dboot_print_efi_version(efi->FirmwareRevision);
1823 	dboot_printf("EFI system table number of entries: %lld\n",
1824 	    efi->NumberOfTableEntries);
1825 	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1826 	    efi->ConfigurationTable;
1827 	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1828 		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1829 		    conf[i].VendorGuid.time_low,
1830 		    conf[i].VendorGuid.time_mid,
1831 		    conf[i].VendorGuid.time_hi_and_version,
1832 		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1833 		    conf[i].VendorGuid.clock_seq_low);
1834 		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1835 		    conf[i].VendorGuid.node_addr[0],
1836 		    conf[i].VendorGuid.node_addr[1],
1837 		    conf[i].VendorGuid.node_addr[2],
1838 		    conf[i].VendorGuid.node_addr[3],
1839 		    conf[i].VendorGuid.node_addr[4],
1840 		    conf[i].VendorGuid.node_addr[5]);
1841 	}
1842 }
1843 #endif /* !__xpv */
1844 
1845 /*
1846  * Simple memory allocator, allocates aligned physical memory.
1847  * Note that startup_kernel() only allocates memory, never frees.
1848  * Memory usage just grows in an upward direction.
1849  */
1850 static void *
1851 do_mem_alloc(uint32_t size, uint32_t align)
1852 {
1853 	uint_t i;
1854 	uint64_t best;
1855 	uint64_t start;
1856 	uint64_t end;
1857 
1858 	/*
1859 	 * make sure size is a multiple of pagesize
1860 	 */
1861 	size = RNDUP(size, MMU_PAGESIZE);
1862 	next_avail_addr = RNDUP(next_avail_addr, align);
1863 
1864 	/*
1865 	 * XXPV fixme joe
1866 	 *
1867 	 * a really large bootarchive that causes you to run out of memory
1868 	 * may cause this to blow up
1869 	 */
1870 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
1871 	best = (uint64_t)-size;
1872 	for (i = 0; i < memlists_used; ++i) {
1873 		start = memlists[i].addr;
1874 #if defined(__xpv)
1875 		start += mfn_base;
1876 #endif
1877 		end = start + memlists[i].size;
1878 
1879 		/*
1880 		 * did we find the desired address?
1881 		 */
1882 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
1883 			best = next_avail_addr;
1884 			goto done;
1885 		}
1886 
1887 		/*
1888 		 * if not is this address the best so far?
1889 		 */
1890 		if (start > next_avail_addr && start < best &&
1891 		    RNDUP(start, align) + size <= end)
1892 			best = RNDUP(start, align);
1893 	}
1894 
1895 	/*
1896 	 * We didn't find exactly the address we wanted, due to going off the
1897 	 * end of a memory region. Return the best found memory address.
1898 	 */
1899 done:
1900 	next_avail_addr = best + size;
1901 #if defined(__xpv)
1902 	if (next_avail_addr > scratch_end)
1903 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1904 		    "0x%lx", (ulong_t)next_avail_addr,
1905 		    (ulong_t)scratch_end);
1906 #endif
1907 	(void) memset((void *)(uintptr_t)best, 0, size);
1908 	return ((void *)(uintptr_t)best);
1909 }
1910 
1911 void *
1912 mem_alloc(uint32_t size)
1913 {
1914 	return (do_mem_alloc(size, MMU_PAGESIZE));
1915 }
1916 
1917 
1918 /*
1919  * Build page tables to map all of memory used so far as well as the kernel.
1920  */
1921 static void
1922 build_page_tables(void)
1923 {
1924 	uint32_t psize;
1925 	uint32_t level;
1926 	uint32_t off;
1927 	uint64_t start;
1928 #if !defined(__xpv)
1929 	uint32_t i;
1930 	uint64_t end;
1931 #endif	/* __xpv */
1932 
1933 	/*
1934 	 * If we're on metal, we need to create the top level pagetable.
1935 	 */
1936 #if defined(__xpv)
1937 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1938 #else /* __xpv */
1939 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1940 #endif /* __xpv */
1941 	DBG((uintptr_t)top_page_table);
1942 
1943 	/*
1944 	 * Determine if we'll use large mappings for kernel, then map it.
1945 	 */
1946 	if (largepage_support) {
1947 		psize = lpagesize;
1948 		level = 1;
1949 	} else {
1950 		psize = MMU_PAGESIZE;
1951 		level = 0;
1952 	}
1953 
1954 	DBG_MSG("Mapping kernel\n");
1955 	DBG(ktext_phys);
1956 	DBG(target_kernel_text);
1957 	DBG(ksize);
1958 	DBG(psize);
1959 	for (off = 0; off < ksize; off += psize)
1960 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1961 
1962 	/*
1963 	 * The kernel will need a 1 page window to work with page tables
1964 	 */
1965 	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1966 	DBG(bi->bi_pt_window);
1967 	bi->bi_pte_to_pt_window =
1968 	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1969 	DBG(bi->bi_pte_to_pt_window);
1970 
1971 #if defined(__xpv)
1972 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1973 		/* If this is a domU we're done. */
1974 		DBG_MSG("\nPage tables constructed\n");
1975 		return;
1976 	}
1977 #endif /* __xpv */
1978 
1979 	/*
1980 	 * We need 1:1 mappings for the lower 1M of memory to access
1981 	 * BIOS tables used by a couple of drivers during boot.
1982 	 *
1983 	 * The following code works because our simple memory allocator
1984 	 * only grows usage in an upwards direction.
1985 	 *
1986 	 * Note that by this point in boot some mappings for low memory
1987 	 * may already exist because we've already accessed device in low
1988 	 * memory.  (Specifically the video frame buffer and keyboard
1989 	 * status ports.)  If we're booting on raw hardware then GRUB
1990 	 * created these mappings for us.  If we're booting under a
1991 	 * hypervisor then we went ahead and remapped these devices into
1992 	 * memory allocated within dboot itself.
1993 	 */
1994 	if (map_debug)
1995 		dboot_printf("1:1 map pa=0..1Meg\n");
1996 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1997 #if defined(__xpv)
1998 		map_ma_at_va(start, start, 0);
1999 #else /* __xpv */
2000 		map_pa_at_va(start, start, 0);
2001 #endif /* __xpv */
2002 	}
2003 
2004 #if !defined(__xpv)
2005 
2006 	for (i = 0; i < memlists_used; ++i) {
2007 		start = memlists[i].addr;
2008 		end = start + memlists[i].size;
2009 
2010 		if (map_debug)
2011 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
2012 			    start, end);
2013 		while (start < end && start < next_avail_addr) {
2014 			map_pa_at_va(start, start, 0);
2015 			start += MMU_PAGESIZE;
2016 		}
2017 		if (start >= next_avail_addr)
2018 			break;
2019 	}
2020 
2021 	/*
2022 	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
2023 	 * device and therefore must not be cached.
2024 	 */
2025 	if (bi->bi_framebuffer != NULL) {
2026 		multiboot_tag_framebuffer_t *fb;
2027 		fb = (multiboot_tag_framebuffer_t *)(uintptr_t)
2028 		    bi->bi_framebuffer;
2029 
2030 		start = fb->framebuffer_common.framebuffer_addr;
2031 		end = start + fb->framebuffer_common.framebuffer_height *
2032 		    fb->framebuffer_common.framebuffer_pitch;
2033 
2034 		pte_bits |= PT_NOCACHE;
2035 		while (start < end) {
2036 			map_pa_at_va(start, start, 0);
2037 			start += MMU_PAGESIZE;
2038 		}
2039 		pte_bits &= ~PT_NOCACHE;
2040 	}
2041 #endif /* !__xpv */
2042 
2043 	DBG_MSG("\nPage tables constructed\n");
2044 }
2045 
/*
 * Message explaining that multiboot is no longer used and how the GRUB entry
 * should be changed.
 */
#define	NO_MULTIBOOT							\
	"multiboot is no longer used to boot the Solaris Operating "	\
	"System.\n"							\
	"The grub entry should be changed to:\n"			\
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n"			\
	"module$ /platform/i86pc/$ISADIR/boot_archive\n"		\
	"See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2052 
2053 static void
2054 dboot_init_xboot_consinfo(void)
2055 {
2056 	uintptr_t addr;
2057 	/*
2058 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
2059 	 */
2060 	addr = (uintptr_t)boot_info;
2061 	addr = (addr + 0xf) & ~0xf;
2062 	bi = (struct xboot_info *)addr;
2063 
2064 #if !defined(__xpv)
2065 	switch (multiboot_version) {
2066 	case 1:
2067 		dboot_multiboot1_xboot_consinfo();
2068 		break;
2069 	case 2:
2070 		dboot_multiboot2_xboot_consinfo();
2071 		break;
2072 	default:
2073 		dboot_panic("Unknown multiboot version: %d\n",
2074 		    multiboot_version);
2075 		break;
2076 	}
2077 	/*
2078 	 * Lookup environment module for the console. Complete module list
2079 	 * will be built after console setup.
2080 	 */
2081 	dboot_find_env();
2082 #endif
2083 }
2084 
2085 /*
2086  * Set up basic data from the boot loader.
2087  * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2088  * 32-bit dboot code setup used to set up and start 64-bit kernel.
2089  * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2090  * start 64-bit illumos kernel.
2091  */
2092 static void
2093 dboot_loader_init(void)
2094 {
2095 #if !defined(__xpv)
2096 	mb_info = NULL;
2097 	mb2_info = NULL;
2098 
2099 	switch (mb_magic) {
2100 	case MB_BOOTLOADER_MAGIC:
2101 		multiboot_version = 1;
2102 		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2103 #if defined(_BOOT_TARGET_amd64)
2104 		load_addr = mb_header.load_addr;
2105 #endif
2106 		break;
2107 
2108 	case MULTIBOOT2_BOOTLOADER_MAGIC:
2109 		multiboot_version = 2;
2110 		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2111 		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2112 #if defined(_BOOT_TARGET_amd64)
2113 		load_addr = mb2_load_addr;
2114 #endif
2115 		break;
2116 
2117 	default:
2118 		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2119 		break;
2120 	}
2121 #endif	/* !defined(__xpv) */
2122 }
2123 
2124 /* Extract the kernel command line from [multi]boot information. */
2125 static char *
2126 dboot_loader_cmdline(void)
2127 {
2128 	char *line = NULL;
2129 
2130 #if defined(__xpv)
2131 	line = (char *)xen_info->cmd_line;
2132 #else /* __xpv */
2133 
2134 	switch (multiboot_version) {
2135 	case 1:
2136 		if (mb_info->flags & MB_INFO_CMDLINE)
2137 			line = (char *)mb_info->cmdline;
2138 		break;
2139 
2140 	case 2:
2141 		line = dboot_multiboot2_cmdline(mb2_info);
2142 		break;
2143 
2144 	default:
2145 		dboot_panic("Unknown multiboot version: %d\n",
2146 		    multiboot_version);
2147 		break;
2148 	}
2149 
2150 #endif /* __xpv */
2151 
2152 	/*
2153 	 * Make sure we have valid pointer so the string operations
2154 	 * will not crash us.
2155 	 */
2156 	if (line == NULL)
2157 		line = "";
2158 
2159 	return (line);
2160 }
2161 
2162 static char *
2163 dboot_loader_name(void)
2164 {
2165 #if defined(__xpv)
2166 	return (NULL);
2167 #else /* __xpv */
2168 	multiboot_tag_string_t *tag;
2169 
2170 	switch (multiboot_version) {
2171 	case 1:
2172 		return ((char *)mb_info->boot_loader_name);
2173 
2174 	case 2:
2175 		tag = dboot_multiboot2_find_tag(mb2_info,
2176 		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2177 		return (tag->mb_string);
2178 	default:
2179 		dboot_panic("Unknown multiboot version: %d\n",
2180 		    multiboot_version);
2181 		break;
2182 	}
2183 
2184 	return (NULL);
2185 #endif /* __xpv */
2186 }
2187 
2188 /*
2189  * startup_kernel has a pretty simple job. It builds pagetables which reflect
2190  * 1:1 mappings for all memory in use. It then also adds mappings for
2191  * the kernel nucleus at virtual address of target_kernel_text using large page
2192  * mappings. The page table pages are also accessible at 1:1 mapped
2193  * virtual addresses.
2194  */
2195 /*ARGSUSED*/
2196 void
2197 startup_kernel(void)
2198 {
2199 	char *cmdline;
2200 	char *bootloader;
2201 #if defined(__xpv)
2202 	physdev_set_iopl_t set_iopl;
2203 #endif /* __xpv */
2204 
2205 	dboot_loader_init();
2206 	/*
2207 	 * At this point we are executing in a 32 bit real mode.
2208 	 */
2209 
2210 	bootloader = dboot_loader_name();
2211 	cmdline = dboot_loader_cmdline();
2212 
2213 #if defined(__xpv)
2214 	/*
2215 	 * For dom0, before we initialize the console subsystem we'll
2216 	 * need to enable io operations, so set I/O priveldge level to 1.
2217 	 */
2218 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2219 		set_iopl.iopl = 1;
2220 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2221 	}
2222 #endif /* __xpv */
2223 
2224 	dboot_init_xboot_consinfo();
2225 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2226 	bcons_init(bi);
2227 
2228 	prom_debug = (find_boot_prop("prom_debug") != NULL);
2229 	map_debug = (find_boot_prop("map_debug") != NULL);
2230 
2231 #if !defined(__xpv)
2232 	dboot_multiboot_get_fwtables();
2233 #endif
2234 	DBG_MSG("\n\nillumos prekernel set: ");
2235 	DBG_MSG(cmdline);
2236 	DBG_MSG("\n");
2237 
2238 	if (bootloader != NULL && prom_debug) {
2239 		dboot_printf("Kernel loaded by: %s\n", bootloader);
2240 #if !defined(__xpv)
2241 		dboot_printf("Using multiboot %d boot protocol.\n",
2242 		    multiboot_version);
2243 #endif
2244 	}
2245 
2246 	if (strstr(cmdline, "multiboot") != NULL) {
2247 		dboot_panic(NO_MULTIBOOT);
2248 	}
2249 
2250 	DBG((uintptr_t)bi);
2251 #if !defined(__xpv)
2252 	DBG((uintptr_t)mb_info);
2253 	DBG((uintptr_t)mb2_info);
2254 	if (mb2_info != NULL)
2255 		DBG(mb2_info->mbi_total_size);
2256 	DBG(bi->bi_acpi_rsdp);
2257 	DBG(bi->bi_smbios);
2258 	DBG(bi->bi_uefi_arch);
2259 	DBG(bi->bi_uefi_systab);
2260 
2261 	if (bi->bi_uefi_systab && prom_debug) {
2262 		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2263 			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2264 			    bi->bi_uefi_systab);
2265 		} else {
2266 			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2267 			    bi->bi_uefi_systab);
2268 		}
2269 	}
2270 #endif
2271 
2272 	/*
2273 	 * Need correct target_kernel_text value
2274 	 */
2275 #if defined(_BOOT_TARGET_amd64)
2276 	target_kernel_text = KERNEL_TEXT_amd64;
2277 #elif defined(__xpv)
2278 	target_kernel_text = KERNEL_TEXT_i386_xpv;
2279 #else
2280 	target_kernel_text = KERNEL_TEXT_i386;
2281 #endif
2282 	DBG(target_kernel_text);
2283 
2284 #if defined(__xpv)
2285 
2286 	/*
2287 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
2288 	 */
2289 
2290 #if defined(_BOOT_TARGET_amd64)
2291 	/*
2292 	 * 64-bit hypervisor.
2293 	 */
2294 	amd64_support = 1;
2295 	pae_support = 1;
2296 
2297 #else	/* _BOOT_TARGET_amd64 */
2298 
2299 	/*
2300 	 * See if we are running on a PAE Hypervisor
2301 	 */
2302 	{
2303 		xen_capabilities_info_t caps;
2304 
2305 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2306 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
2307 		caps[sizeof (caps) - 1] = 0;
2308 		if (prom_debug)
2309 			dboot_printf("xen capabilities %s\n", caps);
2310 		if (strstr(caps, "x86_32p") != NULL)
2311 			pae_support = 1;
2312 	}
2313 
2314 #endif	/* _BOOT_TARGET_amd64 */
2315 	{
2316 		xen_platform_parameters_t p;
2317 
2318 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2319 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
2320 		DBG(p.virt_start);
2321 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2322 	}
2323 
2324 	/*
2325 	 * The hypervisor loads stuff starting at 1Gig
2326 	 */
2327 	mfn_base = ONE_GIG;
2328 	DBG(mfn_base);
2329 
2330 	/*
2331 	 * enable writable page table mode for the hypervisor
2332 	 */
2333 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2334 	    VMASST_TYPE_writable_pagetables) < 0)
2335 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2336 
2337 	/*
2338 	 * check for NX support
2339 	 */
2340 	if (pae_support) {
2341 		uint32_t eax = 0x80000000;
2342 		uint32_t edx = get_cpuid_edx(&eax);
2343 
2344 		if (eax >= 0x80000001) {
2345 			eax = 0x80000001;
2346 			edx = get_cpuid_edx(&eax);
2347 			if (edx & CPUID_AMD_EDX_NX)
2348 				NX_support = 1;
2349 		}
2350 	}
2351 
2352 #if !defined(_BOOT_TARGET_amd64)
2353 
2354 	/*
2355 	 * The 32-bit hypervisor uses segmentation to protect itself from
2356 	 * guests. This means when a guest attempts to install a flat 4GB
2357 	 * code or data descriptor the 32-bit hypervisor will protect itself
2358 	 * by silently shrinking the segment such that if the guest attempts
2359 	 * any access where the hypervisor lives a #gp fault is generated.
2360 	 * The problem is that some applications expect a full 4GB flat
2361 	 * segment for their current thread pointer and will use negative
2362 	 * offset segment wrap around to access data. TLS support in linux
2363 	 * brand is one example of this.
2364 	 *
2365 	 * The 32-bit hypervisor can catch the #gp fault in these cases
2366 	 * and emulate the access without passing the #gp fault to the guest
2367 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2368 	 * Seems like this should have been the default.
2369 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
2370 	 * to deal with emulating these accesses.
2371 	 */
2372 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2373 	    VMASST_TYPE_4gb_segments) < 0)
2374 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2375 #endif	/* !_BOOT_TARGET_amd64 */
2376 
2377 #else	/* __xpv */
2378 
2379 	/*
2380 	 * use cpuid to enable MMU features
2381 	 */
2382 	if (have_cpuid()) {
2383 		uint32_t eax, edx;
2384 
2385 		eax = 1;
2386 		edx = get_cpuid_edx(&eax);
2387 		if (edx & CPUID_INTC_EDX_PSE)
2388 			largepage_support = 1;
2389 		if (edx & CPUID_INTC_EDX_PGE)
2390 			pge_support = 1;
2391 		if (edx & CPUID_INTC_EDX_PAE)
2392 			pae_support = 1;
2393 
2394 		eax = 0x80000000;
2395 		edx = get_cpuid_edx(&eax);
2396 		if (eax >= 0x80000001) {
2397 			eax = 0x80000001;
2398 			edx = get_cpuid_edx(&eax);
2399 			if (edx & CPUID_AMD_EDX_LM)
2400 				amd64_support = 1;
2401 			if (edx & CPUID_AMD_EDX_NX)
2402 				NX_support = 1;
2403 		}
2404 	} else {
2405 		dboot_printf("cpuid not supported\n");
2406 	}
2407 #endif /* __xpv */
2408 
2409 
2410 #if defined(_BOOT_TARGET_amd64)
2411 	if (amd64_support == 0)
2412 		dboot_panic("long mode not supported, rebooting");
2413 	else if (pae_support == 0)
2414 		dboot_panic("long mode, but no PAE; rebooting");
2415 #else
2416 	/*
2417 	 * Allow the command line to over-ride use of PAE for 32 bit.
2418 	 */
2419 	if (strstr(cmdline, "disablePAE=true") != NULL) {
2420 		pae_support = 0;
2421 		NX_support = 0;
2422 		amd64_support = 0;
2423 	}
2424 #endif
2425 
2426 	/*
2427 	 * initialize the simple memory allocator
2428 	 */
2429 	init_mem_alloc();
2430 
2431 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2432 	/*
2433 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2434 	 */
2435 	if (max_mem < FOUR_GIG && NX_support == 0)
2436 		pae_support = 0;
2437 #endif
2438 
2439 	/*
2440 	 * configure mmu information
2441 	 */
2442 	if (pae_support) {
2443 		shift_amt = shift_amt_pae;
2444 		ptes_per_table = 512;
2445 		pte_size = 8;
2446 		lpagesize = TWO_MEG;
2447 #if defined(_BOOT_TARGET_amd64)
2448 		top_level = 3;
2449 #else
2450 		top_level = 2;
2451 #endif
2452 	} else {
2453 		pae_support = 0;
2454 		NX_support = 0;
2455 		shift_amt = shift_amt_nopae;
2456 		ptes_per_table = 1024;
2457 		pte_size = 4;
2458 		lpagesize = FOUR_MEG;
2459 		top_level = 1;
2460 	}
2461 
2462 	DBG(pge_support);
2463 	DBG(NX_support);
2464 	DBG(largepage_support);
2465 	DBG(amd64_support);
2466 	DBG(top_level);
2467 	DBG(pte_size);
2468 	DBG(ptes_per_table);
2469 	DBG(lpagesize);
2470 
2471 #if defined(__xpv)
2472 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
2473 #else
2474 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
2475 #endif
2476 
2477 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2478 	/*
2479 	 * For grub, copy kernel bits from the ELF64 file to final place.
2480 	 */
2481 	DBG_MSG("\nAllocating nucleus pages.\n");
2482 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2483 
2484 	if (ktext_phys == 0)
2485 		dboot_panic("failed to allocate aligned kernel memory");
2486 	DBG(load_addr);
2487 	if (dboot_elfload64(load_addr) != 0)
2488 		dboot_panic("failed to parse kernel ELF image, rebooting");
2489 #endif
2490 
2491 	DBG(ktext_phys);
2492 
2493 	/*
2494 	 * Allocate page tables.
2495 	 */
2496 	build_page_tables();
2497 
2498 	/*
2499 	 * return to assembly code to switch to running kernel
2500 	 */
2501 	entry_addr_low = (uint32_t)target_kernel_text;
2502 	DBG(entry_addr_low);
2503 	bi->bi_use_largepage = largepage_support;
2504 	bi->bi_use_pae = pae_support;
2505 	bi->bi_use_pge = pge_support;
2506 	bi->bi_use_nx = NX_support;
2507 
2508 #if defined(__xpv)
2509 
2510 	bi->bi_next_paddr = next_avail_addr - mfn_base;
2511 	DBG(bi->bi_next_paddr);
2512 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2513 	DBG(bi->bi_next_vaddr);
2514 
2515 	/*
2516 	 * unmap unused pages in start area to make them available for DMA
2517 	 */
2518 	while (next_avail_addr < scratch_end) {
2519 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
2520 		    0, UVMF_INVLPG | UVMF_LOCAL);
2521 		next_avail_addr += MMU_PAGESIZE;
2522 	}
2523 
2524 	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2525 	DBG((uintptr_t)HYPERVISOR_shared_info);
2526 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2527 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2528 
2529 #else /* __xpv */
2530 
2531 	bi->bi_next_paddr = next_avail_addr;
2532 	DBG(bi->bi_next_paddr);
2533 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2534 	DBG(bi->bi_next_vaddr);
2535 	bi->bi_mb_version = multiboot_version;
2536 
2537 	switch (multiboot_version) {
2538 	case 1:
2539 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2540 		break;
2541 	case 2:
2542 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2543 		break;
2544 	default:
2545 		dboot_panic("Unknown multiboot version: %d\n",
2546 		    multiboot_version);
2547 		break;
2548 	}
2549 	bi->bi_top_page_table = (uintptr_t)top_page_table;
2550 
2551 #endif /* __xpv */
2552 
2553 	bi->bi_kseg_size = FOUR_MEG;
2554 	DBG(bi->bi_kseg_size);
2555 
2556 #ifndef __xpv
2557 	if (map_debug)
2558 		dump_tables();
2559 #endif
2560 
2561 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2562 }
2563