xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 187670a04e7557914566fc449b4d3af38caea282)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2012 Joyent, Inc.  All rights reserved.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/sha1.h>
37 
38 #if defined(__xpv)
39 
40 #include <sys/hypervisor.h>
41 uintptr_t xen_virt_start;
42 pfn_t *mfn_to_pfn_mapping;
43 
44 #else /* !__xpv */
45 
46 extern multiboot_header_t mb_header;
47 extern int have_cpuid(void);
48 
49 #endif /* !__xpv */
50 
51 #include <sys/inttypes.h>
52 #include <sys/bootinfo.h>
53 #include <sys/mach_mmu.h>
54 #include <sys/boot_console.h>
55 
56 #include "dboot_asm.h"
57 #include "dboot_printf.h"
58 #include "dboot_xboot.h"
59 #include "dboot_elfload.h"
60 
/* Length of a SHA-1 digest written as ASCII hex: two characters per byte. */
#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
62 
63 /*
64  * This file contains code that runs to transition us from either a multiboot
65  * compliant loader (32 bit non-paging) or a XPV domain loader to
66  * regular kernel execution. Its task is to setup the kernel memory image
67  * and page tables.
68  *
69  * The code executes as:
70  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
71  * 	- a 32 bit program for the 32-bit PV hypervisor
72  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
73  *
74  * Under the PV hypervisor, we must create mappings for any memory beyond the
75  * initial start of day allocation (such as the kernel itself).
76  *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running without paging enabled, all such memory is accessible.
79  */
80 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 * make_ptable() ORs ptp_bits into the entry for a new lower level table
 * (except for a level 2 top table, which only gets PT_VALID), and
 * map_ma_at_va() ORs pte_bits into every page mapping it creates.
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
86 
/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

/*
 * Value to use for KERNEL_TEXT. map_ma_at_va() compares the virtual address
 * being mapped against it when deciding whether to set PT_GLOBAL.
 */
static uint64_t target_kernel_text;
96 
/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation. do_mem_alloc() hands out memory
 * starting from this address and moves it past each allocation it makes.
 */
static paddr_t next_avail_addr = 0;
106 
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* !__xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* !__xpv */
127 
/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;
133 
/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU.
 * In this file pae_support selects between x86pte_t and x86pte32_t sized
 * page table entries, and pge_support gates the use of PT_GLOBAL.
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;
153 
/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each array is paired with a *_used counter holding the number of
 * entries that have been filled in so far.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * Boot modules handed to the kernel; filled in by init_mem_alloc().
 */
#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;
168 
/*
 * Debugging flags. When non-zero, prom_debug enables the extra
 * dboot_printf() output about memory lists and modules, and map_debug
 * enables output about page tables and mappings as they are created.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
174 
175 /*
176  * Either hypervisor-specific or grub-specific code builds the initial
177  * memlists. This code does the sort/merge/link for final use.
178  */
179 static void
180 sort_physinstall(void)
181 {
182 	int i;
183 #if !defined(__xpv)
184 	int j;
185 	struct boot_memlist tmp;
186 
187 	/*
188 	 * Now sort the memlists, in case they weren't in order.
189 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
190 	 */
191 	DBG_MSG("Sorting phys-installed list\n");
192 	for (j = memlists_used - 1; j > 0; --j) {
193 		for (i = 0; i < j; ++i) {
194 			if (memlists[i].addr < memlists[i + 1].addr)
195 				continue;
196 			tmp = memlists[i];
197 			memlists[i] = memlists[i + 1];
198 			memlists[i + 1] = tmp;
199 		}
200 	}
201 
202 	/*
203 	 * Merge any memlists that don't have holes between them.
204 	 */
205 	for (i = 0; i <= memlists_used - 1; ++i) {
206 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
207 			continue;
208 
209 		if (prom_debug)
210 			dboot_printf(
211 			    "merging mem segs %" PRIx64 "...%" PRIx64
212 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
213 			    memlists[i].addr,
214 			    memlists[i].addr + memlists[i].size,
215 			    memlists[i + 1].addr,
216 			    memlists[i + 1].addr + memlists[i + 1].size);
217 
218 		memlists[i].size += memlists[i + 1].size;
219 		for (j = i + 1; j < memlists_used - 1; ++j)
220 			memlists[j] = memlists[j + 1];
221 		--memlists_used;
222 		DBG(memlists_used);
223 		--i;	/* after merging we need to reexamine, so do this */
224 	}
225 #endif	/* __xpv */
226 
227 	if (prom_debug) {
228 		dboot_printf("\nFinal memlists:\n");
229 		for (i = 0; i < memlists_used; ++i) {
230 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
231 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
232 		}
233 	}
234 
235 	/*
236 	 * link together the memlists with native size pointers
237 	 */
238 	memlists[0].next = 0;
239 	memlists[0].prev = 0;
240 	for (i = 1; i < memlists_used; ++i) {
241 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
242 		memlists[i].next = 0;
243 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
244 	}
245 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
246 	DBG(bi->bi_phys_install);
247 }
248 
249 /*
250  * build bios reserved memlists
251  */
252 static void
253 build_rsvdmemlists(void)
254 {
255 	int i;
256 
257 	rsvdmemlists[0].next = 0;
258 	rsvdmemlists[0].prev = 0;
259 	for (i = 1; i < rsvdmemlists_used; ++i) {
260 		rsvdmemlists[i].prev =
261 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
262 		rsvdmemlists[i].next = 0;
263 		rsvdmemlists[i - 1].next =
264 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
265 	}
266 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
267 	DBG(bi->bi_rsvdmem);
268 }
269 
270 #if defined(__xpv)
271 
272 /*
273  * halt on the hypervisor after a delay to drain console output
274  */
275 void
276 dboot_halt(void)
277 {
278 	uint_t i = 10000;
279 
280 	while (--i)
281 		(void) HYPERVISOR_yield();
282 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
283 }
284 
285 /*
286  * From a machine address, find the corresponding pseudo-physical address.
287  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
288  * Machine addresses are the real underlying hardware addresses.
289  * These are needed for page table entries. Note that this routine is
290  * poorly protected. A bad value of "ma" will cause a page fault.
291  */
292 paddr_t
293 ma_to_pa(maddr_t ma)
294 {
295 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
296 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
297 	paddr_t pa;
298 
299 	if (pfn >= xen_info->nr_pages)
300 		return (-(paddr_t)1);
301 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
302 #ifdef DEBUG
303 	if (ma != pa_to_ma(pa))
304 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
305 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
306 #endif
307 	return (pa);
308 }
309 
310 /*
311  * From a pseudo-physical address, find the corresponding machine address.
312  */
313 maddr_t
314 pa_to_ma(paddr_t pa)
315 {
316 	pfn_t pfn;
317 	ulong_t mfn;
318 
319 	pfn = mmu_btop(pa - mfn_base);
320 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
321 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
322 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
323 #ifdef DEBUG
324 	if (mfn_to_pfn_mapping[mfn] != pfn)
325 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
326 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
327 #endif
328 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
329 }
330 
331 #endif	/* __xpv */
332 
333 x86pte_t
334 get_pteval(paddr_t table, uint_t index)
335 {
336 	if (pae_support)
337 		return (((x86pte_t *)(uintptr_t)table)[index]);
338 	return (((x86pte32_t *)(uintptr_t)table)[index]);
339 }
340 
341 /*ARGSUSED*/
342 void
343 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
344 {
345 #ifdef __xpv
346 	mmu_update_t t;
347 	maddr_t mtable = pa_to_ma(table);
348 	int retcnt;
349 
350 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
351 	t.val = pteval;
352 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
353 		dboot_panic("HYPERVISOR_mmu_update() failed");
354 #else /* __xpv */
355 	uintptr_t tab_addr = (uintptr_t)table;
356 
357 	if (pae_support)
358 		((x86pte_t *)tab_addr)[index] = pteval;
359 	else
360 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
361 	if (level == top_level && level == 2)
362 		reload_cr3();
363 #endif /* __xpv */
364 }
365 
366 paddr_t
367 make_ptable(x86pte_t *pteval, uint_t level)
368 {
369 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
370 
371 	if (level == top_level && level == 2)
372 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
373 	else
374 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
375 
376 #ifdef __xpv
377 	/* Remove write permission to the new page table. */
378 	if (HYPERVISOR_update_va_mapping(new_table,
379 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
380 		dboot_panic("HYP_update_va_mapping error");
381 #endif
382 
383 	if (map_debug)
384 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
385 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
386 	return (new_table);
387 }
388 
389 x86pte_t *
390 map_pte(paddr_t table, uint_t index)
391 {
392 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
393 }
394 
/*
 * dump out the contents of page tables...
 *
 * Debugging aid: prints, via dboot_printf(), every non-zero entry reachable
 * from top_page_table. The walk is depth first but written iteratively.
 * Before descending into a lower level table, the current table and index
 * are parked in save_table[]/save_index[] (indexed by level); they are
 * restored once the lower level table has been scanned to its end.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;		/* level of the table being scanned */
	uint64_t va;		/* virtual address of the current entry */
	uint64_t pgsize;	/* bytes mapped by one entry at level l */
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	/* tabs + l is the empty string at top_level, one more tab per level */
	char *tabs = tablist + 3 - top_level;
	/*
	 * NOTE(review): ma_to_pa() results are stored in these uint_t
	 * variables; that looks like it would truncate an address wider
	 * than uint_t -- confirm whether this is intended.
	 */
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		/* entry width depends on whether PAE is in use */
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 *
		 * Descend when the entry refers to a lower level table:
		 * any entry above level 1, or a level 1 entry that is not
		 * a large page. index is set to -1 so that the loop's
		 * ++index restarts the scan at entry 0 of the new table.
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 *
		 * Count how many following entries map physically
		 * consecutive pages; a run longer than two entries is
		 * replaced by a "..." line and skipped up to its last entry.
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		/*
		 * NOTE(review): this adjustment runs after entry 256 has
		 * been handled on this path, and is not reached when coming
		 * back up from a lower level table -- confirm the printed
		 * va is correct on both sides of the hole.
		 */
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/*
	 * The current table is exhausted. Unless it was the top level
	 * table, pop back to the parent and jump back into the loop so
	 * that scanning resumes with the entry after the saved index.
	 */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
481 
482 /*
483  * Add a mapping for the machine page at the given virtual address.
484  */
485 static void
486 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
487 {
488 	x86pte_t *ptep;
489 	x86pte_t pteval;
490 
491 	pteval = ma | pte_bits;
492 	if (level > 0)
493 		pteval |= PT_PAGESIZE;
494 	if (va >= target_kernel_text && pge_support)
495 		pteval |= PT_GLOBAL;
496 
497 	if (map_debug && ma != va)
498 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
499 		    " pte=0x%" PRIx64 " l=%d\n",
500 		    (uint64_t)ma, (uint64_t)va, pteval, level);
501 
502 #if defined(__xpv)
503 	/*
504 	 * see if we can avoid find_pte() on the hypervisor
505 	 */
506 	if (HYPERVISOR_update_va_mapping(va, pteval,
507 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
508 		return;
509 #endif
510 
511 	/*
512 	 * Find the pte that will map this address. This creates any
513 	 * missing intermediate level page tables
514 	 */
515 	ptep = find_pte(va, NULL, level, 0);
516 
517 	/*
518 	 * When paravirtualized, we must use hypervisor calls to modify the
519 	 * PTE, since paging is active. On real hardware we just write to
520 	 * the pagetables which aren't in use yet.
521 	 */
522 #if defined(__xpv)
523 	ptep = ptep;	/* shut lint up */
524 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
525 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
526 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
527 		    (uint64_t)va, level, (uint64_t)ma, pteval);
528 #else
529 	if (va < 1024 * 1024)
530 		pteval |= PT_NOCACHE;		/* for video RAM */
531 	if (pae_support)
532 		*ptep = pteval;
533 	else
534 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
535 #endif
536 }
537 
538 /*
539  * Add a mapping for the physical page at the given virtual address.
540  */
541 static void
542 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
543 {
544 	map_ma_at_va(pa_to_ma(pa), va, level);
545 }
546 
547 /*
548  * This is called to remove start..end from the
549  * possible range of PCI addresses.
550  */
551 const uint64_t pci_lo_limit = 0x00100000ul;
552 const uint64_t pci_hi_limit = 0xfff00000ul;
553 static void
554 exclude_from_pci(uint64_t start, uint64_t end)
555 {
556 	int i;
557 	int j;
558 	struct boot_memlist *ml;
559 
560 	for (i = 0; i < pcimemlists_used; ++i) {
561 		ml = &pcimemlists[i];
562 
563 		/* delete the entire range? */
564 		if (start <= ml->addr && ml->addr + ml->size <= end) {
565 			--pcimemlists_used;
566 			for (j = i; j < pcimemlists_used; ++j)
567 				pcimemlists[j] = pcimemlists[j + 1];
568 			--i;	/* to revisit the new one at this index */
569 		}
570 
571 		/* split a range? */
572 		else if (ml->addr < start && end < ml->addr + ml->size) {
573 
574 			++pcimemlists_used;
575 			if (pcimemlists_used > MAX_MEMLIST)
576 				dboot_panic("too many pcimemlists");
577 
578 			for (j = pcimemlists_used - 1; j > i; --j)
579 				pcimemlists[j] = pcimemlists[j - 1];
580 			ml->size = start - ml->addr;
581 
582 			++ml;
583 			ml->size = (ml->addr + ml->size) - end;
584 			ml->addr = end;
585 			++i;	/* skip on to next one */
586 		}
587 
588 		/* cut memory off the start? */
589 		else if (ml->addr < end && end < ml->addr + ml->size) {
590 			ml->size -= end - ml->addr;
591 			ml->addr = end;
592 		}
593 
594 		/* cut memory off the end? */
595 		else if (ml->addr <= start && start < ml->addr + ml->size) {
596 			ml->size = start - ml->addr;
597 		}
598 	}
599 }
600 
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source. mmap_t is therefore a local structure without
 * that field under the hypervisor, and plain mb_memory_map_t otherwise.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
616 
617 static void
618 build_pcimemlists(mmap_t *mem, int num)
619 {
620 	mmap_t *mmap;
621 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
622 	uint64_t start;
623 	uint64_t end;
624 	int i;
625 
626 	/*
627 	 * initialize
628 	 */
629 	pcimemlists[0].addr = pci_lo_limit;
630 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
631 	pcimemlists_used = 1;
632 
633 	/*
634 	 * Fill in PCI memlists.
635 	 */
636 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
637 		start = ((uint64_t)mmap->base_addr_high << 32) +
638 		    mmap->base_addr_low;
639 		end = start + ((uint64_t)mmap->length_high << 32) +
640 		    mmap->length_low;
641 
642 		if (prom_debug)
643 			dboot_printf("\ttype: %d %" PRIx64 "..%"
644 			    PRIx64 "\n", mmap->type, start, end);
645 
646 		/*
647 		 * page align start and end
648 		 */
649 		start = (start + page_offset) & ~page_offset;
650 		end &= ~page_offset;
651 		if (end <= start)
652 			continue;
653 
654 		exclude_from_pci(start, end);
655 	}
656 
657 	/*
658 	 * Finish off the pcimemlist
659 	 */
660 	if (prom_debug) {
661 		for (i = 0; i < pcimemlists_used; ++i) {
662 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
663 			    PRIx64 "\n", pcimemlists[i].addr,
664 			    pcimemlists[i].addr + pcimemlists[i].size);
665 		}
666 	}
667 	pcimemlists[0].next = 0;
668 	pcimemlists[0].prev = 0;
669 	for (i = 1; i < pcimemlists_used; ++i) {
670 		pcimemlists[i].prev =
671 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
672 		pcimemlists[i].next = 0;
673 		pcimemlists[i - 1].next =
674 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
675 	}
676 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
677 	DBG(bi->bi_pcimem);
678 }
679 
680 #if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 *
 * Besides setting up the allocator (scratch_end, next_avail_addr), this also
 * records the boot module, builds the single-entry physinstall memlist and
 * the reserved memlist, and, for the initial domain only, builds the PCI
 * memlist from the machine memory map.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];	/* receives the machine memory map */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressable range.  We'll
	 * switch to new page tables before we unpack the kernel
	 */
	/* the address of a local variable tells us where the stack is */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	/* the highest memory address follows from the domain's page count */
	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 *
		 * The hypervisor fills map_buffer with up to MAXMAPS entries
		 * and the count it leaves in map.nr_entries is what gets
		 * passed on to build_pcimemlists().
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}
772 
773 #else	/* !__xpv */
774 
/*
 * Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f' or 'A'-'F') into
 * its value, 0 through 15. Any other character is rejected with a panic.
 *
 * Each range is checked at both ends. Checking only the lower bound would
 * let characters such as 'g'..'z', 'G'..'Z' or ':'..'@' through and turn
 * them into values that are not valid hex digits.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
789 
790 static void
791 digest_a2h(const char *ascii, uint8_t *digest)
792 {
793 	unsigned int i;
794 
795 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
796 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
797 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
798 	}
799 }
800 
801 /*
802  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
803  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
804  * match, return 0, otherwise -1.  This works only for images smaller than
805  * 4 GB, which should not be a problem.
806  */
807 static int
808 check_image_hash(const char *ascii, const void *image, size_t len)
809 {
810 	SHA1_CTX ctx;
811 	uint8_t digest[SHA1_DIGEST_LENGTH];
812 	uint8_t baseline[SHA1_DIGEST_LENGTH];
813 	unsigned int i;
814 
815 	digest_a2h(ascii, baseline);
816 
817 	SHA1Init(&ctx);
818 	SHA1Update(&ctx, image, len);
819 	SHA1Final(digest, &ctx);
820 
821 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
822 		if (digest[i] != baseline[i])
823 			return (-1);
824 	}
825 
826 	return (0);
827 }
828 
829 static void
830 check_images(void)
831 {
832 	int i;
833 	char *hashes;
834 	mb_module_t *mod, *hashmod;
835 	char *hash;
836 	char displayhash[SHA1_ASCII_LENGTH + 1];
837 	size_t hashlen;
838 	size_t len;
839 
840 	/*
841 	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
842 	 * the address of the last valid byte in a module plus 1 as mod_end.
843 	 * This is of course a bug; the multiboot specification simply states
844 	 * that mod_start and mod_end "contain the start and end addresses of
845 	 * the boot module itself" which is pretty obviously not what GRUB is
846 	 * doing.  However, fixing it requires that not only this code be
847 	 * changed but also that other code consuming this value and values
848 	 * derived from it be fixed, and that the kernel and GRUB must either
849 	 * both have the bug or neither.  While there are a lot of combinations
850 	 * that will work, there are also some that won't, so for simplicity
851 	 * we'll just cope with the bug.  That means we won't actually hash the
852 	 * byte at mod_end, and we will expect that mod_end for the hash file
853 	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
854 	 * hash plus a newline for each module).
855 	 */
856 
857 	if (mb_info->mods_count > 1) {
858 		mod = (mb_module_t *)mb_info->mods_addr;
859 		hashmod = mod + (mb_info->mods_count - 1);
860 		hashes = (char *)hashmod->mod_start;
861 		hashlen = (size_t)(hashmod->mod_end - hashmod->mod_start);
862 		hash = hashes;
863 		if (prom_debug) {
864 			dboot_printf("Hash module found at %lx size %lx\n",
865 			    (ulong_t)hashes, (ulong_t)hashlen);
866 		}
867 	} else {
868 		DBG_MSG("Skipping hash check; no hash module found.\n");
869 		return;
870 	}
871 
872 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
873 	    i < mb_info->mods_count - 1; ++mod, ++i) {
874 		if ((hash - hashes) + SHA1_ASCII_LENGTH + 1 > hashlen) {
875 			dboot_printf("Short hash module of length 0x%lx bytes; "
876 			    "skipping hash checks\n", (ulong_t)hashlen);
877 			break;
878 		}
879 
880 		(void) memcpy(displayhash, hash, SHA1_ASCII_LENGTH);
881 		displayhash[SHA1_ASCII_LENGTH] = '\0';
882 		if (prom_debug) {
883 			dboot_printf("Checking hash for module %d [%s]: ",
884 			    i, displayhash);
885 		}
886 
887 		len = mod->mod_end - mod->mod_start;	/* see above */
888 		if (check_image_hash(hash, (void *)mod->mod_start, len) != 0) {
889 			dboot_panic("SHA-1 hash mismatch on %s; expected %s\n",
890 			    (char *)mod->mod_name, displayhash);
891 		} else {
892 			DBG_MSG("OK\n");
893 		}
894 		hash += SHA1_ASCII_LENGTH + 1;
895 	}
896 }
897 
898 /*
899  * During memory allocation, find the highest address not used yet.
900  */
901 static void
902 check_higher(paddr_t a)
903 {
904 	if (a < next_avail_addr)
905 		return;
906 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
907 	DBG(next_avail_addr);
908 }
909 
910 /*
911  * Walk through the module information finding the last used address.
912  * The first available address will become the top level page table.
913  *
914  * We then build the phys_install memlist from the multiboot information.
915  */
916 static void
917 init_mem_alloc(void)
918 {
919 	mb_memory_map_t *mmap;
920 	mb_module_t *mod;
921 	uint64_t start;
922 	uint64_t end;
923 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
924 	extern char _end[];
925 	int i;
926 
927 	DBG_MSG("Entered init_mem_alloc()\n");
928 	DBG((uintptr_t)mb_info);
929 
930 	if (mb_info->mods_count > MAX_MODULES) {
931 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
932 		    mb_info->mods_count, MAX_MODULES);
933 	}
934 	/*
935 	 * search the modules to find the last used address
936 	 * we'll build the module list while we're walking through here
937 	 */
938 	DBG_MSG("\nFinding Modules\n");
939 	check_higher((paddr_t)(uintptr_t)&_end);
940 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
941 	    i < mb_info->mods_count;
942 	    ++mod, ++i) {
943 		if (prom_debug) {
944 			dboot_printf("\tmodule #%d: %s at: 0x%lx, end 0x%lx\n",
945 			    i, (char *)(mod->mod_name),
946 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
947 		}
948 		modules[i].bm_addr = mod->mod_start;
949 		if (mod->mod_start > mod->mod_end) {
950 			dboot_panic("module[%d]: Invalid module start address "
951 			    "(0x%llx)", i, (uint64_t)mod->mod_start);
952 		}
953 		modules[i].bm_size = mod->mod_end - mod->mod_start;
954 
955 		check_higher(mod->mod_end);
956 	}
957 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
958 	DBG(bi->bi_modules);
959 	bi->bi_module_cnt = mb_info->mods_count;
960 	DBG(bi->bi_module_cnt);
961 
962 	check_images();
963 
964 	/*
965 	 * Walk through the memory map from multiboot and build our memlist
966 	 * structures. Note these will have native format pointers.
967 	 */
968 	DBG_MSG("\nFinding Memory Map\n");
969 	DBG(mb_info->flags);
970 	max_mem = 0;
971 	if (mb_info->flags & 0x40) {
972 		int cnt = 0;
973 
974 		DBG(mb_info->mmap_addr);
975 		DBG(mb_info->mmap_length);
976 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
977 
978 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
979 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
980 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
981 		    + sizeof (mmap->size))) {
982 			++cnt;
983 			start = ((uint64_t)mmap->base_addr_high << 32) +
984 			    mmap->base_addr_low;
985 			end = start + ((uint64_t)mmap->length_high << 32) +
986 			    mmap->length_low;
987 
988 			if (prom_debug)
989 				dboot_printf("\ttype: %d %" PRIx64 "..%"
990 				    PRIx64 "\n", mmap->type, start, end);
991 
992 			/*
993 			 * page align start and end
994 			 */
995 			start = (start + page_offset) & ~page_offset;
996 			end &= ~page_offset;
997 			if (end <= start)
998 				continue;
999 
1000 			/*
1001 			 * only type 1 is usable RAM
1002 			 */
1003 			switch (mmap->type) {
1004 			case 1:
1005 				if (end > max_mem)
1006 					max_mem = end;
1007 				memlists[memlists_used].addr = start;
1008 				memlists[memlists_used].size = end - start;
1009 				++memlists_used;
1010 				if (memlists_used > MAX_MEMLIST)
1011 					dboot_panic("too many memlists");
1012 				break;
1013 			case 2:
1014 				rsvdmemlists[rsvdmemlists_used].addr = start;
1015 				rsvdmemlists[rsvdmemlists_used].size =
1016 				    end - start;
1017 				++rsvdmemlists_used;
1018 				if (rsvdmemlists_used > MAX_MEMLIST)
1019 					dboot_panic("too many rsvdmemlists");
1020 				break;
1021 			default:
1022 				continue;
1023 			}
1024 		}
1025 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
1026 	} else if (mb_info->flags & 0x01) {
1027 		DBG(mb_info->mem_lower);
1028 		memlists[memlists_used].addr = 0;
1029 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
1030 		++memlists_used;
1031 		DBG(mb_info->mem_upper);
1032 		memlists[memlists_used].addr = 1024 * 1024;
1033 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
1034 		++memlists_used;
1035 
1036 		/*
1037 		 * Old platform - assume I/O space at the end of memory.
1038 		 */
1039 		pcimemlists[0].addr =
1040 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
1041 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1042 		pcimemlists[0].next = 0;
1043 		pcimemlists[0].prev = 0;
1044 		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1045 		DBG(bi->bi_pcimem);
1046 	} else {
1047 		dboot_panic("No memory info from boot loader!!!");
1048 	}
1049 
1050 	check_higher(bi->bi_cmdline);
1051 
1052 	/*
1053 	 * finish processing the physinstall list
1054 	 */
1055 	sort_physinstall();
1056 
1057 	/*
1058 	 * build bios reserved mem lists
1059 	 */
1060 	build_rsvdmemlists();
1061 }
1062 #endif /* !__xpv */
1063 
1064 /*
1065  * Simple memory allocator, allocates aligned physical memory.
1066  * Note that startup_kernel() only allocates memory, never frees.
1067  * Memory usage just grows in an upward direction.
1068  */
1069 static void *
1070 do_mem_alloc(uint32_t size, uint32_t align)
1071 {
1072 	uint_t i;
1073 	uint64_t best;
1074 	uint64_t start;
1075 	uint64_t end;
1076 
1077 	/*
1078 	 * make sure size is a multiple of pagesize
1079 	 */
1080 	size = RNDUP(size, MMU_PAGESIZE);
1081 	next_avail_addr = RNDUP(next_avail_addr, align);
1082 
1083 	/*
1084 	 * XXPV fixme joe
1085 	 *
1086 	 * a really large bootarchive that causes you to run out of memory
1087 	 * may cause this to blow up
1088 	 */
1089 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
1090 	best = (uint64_t)-size;
1091 	for (i = 0; i < memlists_used; ++i) {
1092 		start = memlists[i].addr;
1093 #if defined(__xpv)
1094 		start += mfn_base;
1095 #endif
1096 		end = start + memlists[i].size;
1097 
1098 		/*
1099 		 * did we find the desired address?
1100 		 */
1101 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
1102 			best = next_avail_addr;
1103 			goto done;
1104 		}
1105 
1106 		/*
1107 		 * if not is this address the best so far?
1108 		 */
1109 		if (start > next_avail_addr && start < best &&
1110 		    RNDUP(start, align) + size <= end)
1111 			best = RNDUP(start, align);
1112 	}
1113 
1114 	/*
1115 	 * We didn't find exactly the address we wanted, due to going off the
1116 	 * end of a memory region. Return the best found memory address.
1117 	 */
1118 done:
1119 	next_avail_addr = best + size;
1120 #if defined(__xpv)
1121 	if (next_avail_addr > scratch_end)
1122 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1123 		    "0x%lx", (ulong_t)next_avail_addr,
1124 		    (ulong_t)scratch_end);
1125 #endif
1126 	(void) memset((void *)(uintptr_t)best, 0, size);
1127 	return ((void *)(uintptr_t)best);
1128 }
1129 
1130 void *
1131 mem_alloc(uint32_t size)
1132 {
1133 	return (do_mem_alloc(size, MMU_PAGESIZE));
1134 }
1135 
1136 
1137 /*
1138  * Build page tables to map all of memory used so far as well as the kernel.
1139  */
1140 static void
1141 build_page_tables(void)
1142 {
1143 	uint32_t psize;
1144 	uint32_t level;
1145 	uint32_t off;
1146 	uint64_t start;
1147 #if !defined(__xpv)
1148 	uint32_t i;
1149 	uint64_t end;
1150 #endif	/* __xpv */
1151 
1152 	/*
1153 	 * If we're on metal, we need to create the top level pagetable.
1154 	 */
1155 #if defined(__xpv)
1156 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1157 #else /* __xpv */
1158 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1159 #endif /* __xpv */
1160 	DBG((uintptr_t)top_page_table);
1161 
1162 	/*
1163 	 * Determine if we'll use large mappings for kernel, then map it.
1164 	 */
1165 	if (largepage_support) {
1166 		psize = lpagesize;
1167 		level = 1;
1168 	} else {
1169 		psize = MMU_PAGESIZE;
1170 		level = 0;
1171 	}
1172 
1173 	DBG_MSG("Mapping kernel\n");
1174 	DBG(ktext_phys);
1175 	DBG(target_kernel_text);
1176 	DBG(ksize);
1177 	DBG(psize);
1178 	for (off = 0; off < ksize; off += psize)
1179 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1180 
1181 	/*
1182 	 * The kernel will need a 1 page window to work with page tables
1183 	 */
1184 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1185 	DBG(bi->bi_pt_window);
1186 	bi->bi_pte_to_pt_window =
1187 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1188 	DBG(bi->bi_pte_to_pt_window);
1189 
1190 #if defined(__xpv)
1191 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1192 		/* If this is a domU we're done. */
1193 		DBG_MSG("\nPage tables constructed\n");
1194 		return;
1195 	}
1196 #endif /* __xpv */
1197 
1198 	/*
1199 	 * We need 1:1 mappings for the lower 1M of memory to access
1200 	 * BIOS tables used by a couple of drivers during boot.
1201 	 *
1202 	 * The following code works because our simple memory allocator
1203 	 * only grows usage in an upwards direction.
1204 	 *
1205 	 * Note that by this point in boot some mappings for low memory
1206 	 * may already exist because we've already accessed device in low
1207 	 * memory.  (Specifically the video frame buffer and keyboard
1208 	 * status ports.)  If we're booting on raw hardware then GRUB
1209 	 * created these mappings for us.  If we're booting under a
1210 	 * hypervisor then we went ahead and remapped these devices into
1211 	 * memory allocated within dboot itself.
1212 	 */
1213 	if (map_debug)
1214 		dboot_printf("1:1 map pa=0..1Meg\n");
1215 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1216 #if defined(__xpv)
1217 		map_ma_at_va(start, start, 0);
1218 #else /* __xpv */
1219 		map_pa_at_va(start, start, 0);
1220 #endif /* __xpv */
1221 	}
1222 
1223 #if !defined(__xpv)
1224 	for (i = 0; i < memlists_used; ++i) {
1225 		start = memlists[i].addr;
1226 
1227 		end = start + memlists[i].size;
1228 
1229 		if (map_debug)
1230 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1231 			    start, end);
1232 		while (start < end && start < next_avail_addr) {
1233 			map_pa_at_va(start, start, 0);
1234 			start += MMU_PAGESIZE;
1235 		}
1236 	}
1237 #endif /* !__xpv */
1238 
1239 	DBG_MSG("\nPage tables constructed\n");
1240 }
1241 
1242 #define	NO_MULTIBOOT	\
1243 "multiboot is no longer used to boot the Solaris Operating System.\n\
1244 The grub entry should be changed to:\n\
1245 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
1246 module$ /platform/i86pc/$ISADIR/boot_archive\n\
1247 See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1248 
1249 /*
1250  * startup_kernel has a pretty simple job. It builds pagetables which reflect
1251  * 1:1 mappings for all memory in use. It then also adds mappings for
1252  * the kernel nucleus at virtual address of target_kernel_text using large page
1253  * mappings. The page table pages are also accessible at 1:1 mapped
1254  * virtual addresses.
1255  */
1256 /*ARGSUSED*/
1257 void
1258 startup_kernel(void)
1259 {
1260 	char *cmdline;
1261 	uintptr_t addr;
1262 #if defined(__xpv)
1263 	physdev_set_iopl_t set_iopl;
1264 #endif /* __xpv */
1265 
1266 	/*
1267 	 * At this point we are executing in a 32 bit real mode.
1268 	 */
1269 #if defined(__xpv)
1270 	cmdline = (char *)xen_info->cmd_line;
1271 #else /* __xpv */
1272 	cmdline = (char *)mb_info->cmdline;
1273 #endif /* __xpv */
1274 
1275 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1276 	map_debug = (strstr(cmdline, "map_debug") != NULL);
1277 
1278 #if defined(__xpv)
1279 	/*
1280 	 * For dom0, before we initialize the console subsystem we'll
1281 	 * need to enable io operations, so set I/O priveldge level to 1.
1282 	 */
1283 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1284 		set_iopl.iopl = 1;
1285 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1286 	}
1287 #endif /* __xpv */
1288 
1289 	bcons_init(cmdline);
1290 	DBG_MSG("\n\nSolaris prekernel set: ");
1291 	DBG_MSG(cmdline);
1292 	DBG_MSG("\n");
1293 
1294 	if (strstr(cmdline, "multiboot") != NULL) {
1295 		dboot_panic(NO_MULTIBOOT);
1296 	}
1297 
1298 	/*
1299 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1300 	 */
1301 	addr = (uintptr_t)boot_info;
1302 	addr = (addr + 0xf) & ~0xf;
1303 	bi = (struct xboot_info *)addr;
1304 	DBG((uintptr_t)bi);
1305 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1306 
1307 	/*
1308 	 * Need correct target_kernel_text value
1309 	 */
1310 #if defined(_BOOT_TARGET_amd64)
1311 	target_kernel_text = KERNEL_TEXT_amd64;
1312 #elif defined(__xpv)
1313 	target_kernel_text = KERNEL_TEXT_i386_xpv;
1314 #else
1315 	target_kernel_text = KERNEL_TEXT_i386;
1316 #endif
1317 	DBG(target_kernel_text);
1318 
1319 #if defined(__xpv)
1320 
1321 	/*
1322 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
1323 	 */
1324 
1325 #if defined(_BOOT_TARGET_amd64)
1326 	/*
1327 	 * 64-bit hypervisor.
1328 	 */
1329 	amd64_support = 1;
1330 	pae_support = 1;
1331 
1332 #else	/* _BOOT_TARGET_amd64 */
1333 
1334 	/*
1335 	 * See if we are running on a PAE Hypervisor
1336 	 */
1337 	{
1338 		xen_capabilities_info_t caps;
1339 
1340 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1341 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
1342 		caps[sizeof (caps) - 1] = 0;
1343 		if (prom_debug)
1344 			dboot_printf("xen capabilities %s\n", caps);
1345 		if (strstr(caps, "x86_32p") != NULL)
1346 			pae_support = 1;
1347 	}
1348 
1349 #endif	/* _BOOT_TARGET_amd64 */
1350 	{
1351 		xen_platform_parameters_t p;
1352 
1353 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1354 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
1355 		DBG(p.virt_start);
1356 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1357 	}
1358 
1359 	/*
1360 	 * The hypervisor loads stuff starting at 1Gig
1361 	 */
1362 	mfn_base = ONE_GIG;
1363 	DBG(mfn_base);
1364 
1365 	/*
1366 	 * enable writable page table mode for the hypervisor
1367 	 */
1368 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1369 	    VMASST_TYPE_writable_pagetables) < 0)
1370 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1371 
1372 	/*
1373 	 * check for NX support
1374 	 */
1375 	if (pae_support) {
1376 		uint32_t eax = 0x80000000;
1377 		uint32_t edx = get_cpuid_edx(&eax);
1378 
1379 		if (eax >= 0x80000001) {
1380 			eax = 0x80000001;
1381 			edx = get_cpuid_edx(&eax);
1382 			if (edx & CPUID_AMD_EDX_NX)
1383 				NX_support = 1;
1384 		}
1385 	}
1386 
1387 #if !defined(_BOOT_TARGET_amd64)
1388 
1389 	/*
1390 	 * The 32-bit hypervisor uses segmentation to protect itself from
1391 	 * guests. This means when a guest attempts to install a flat 4GB
1392 	 * code or data descriptor the 32-bit hypervisor will protect itself
1393 	 * by silently shrinking the segment such that if the guest attempts
1394 	 * any access where the hypervisor lives a #gp fault is generated.
1395 	 * The problem is that some applications expect a full 4GB flat
1396 	 * segment for their current thread pointer and will use negative
1397 	 * offset segment wrap around to access data. TLS support in linux
1398 	 * brand is one example of this.
1399 	 *
1400 	 * The 32-bit hypervisor can catch the #gp fault in these cases
1401 	 * and emulate the access without passing the #gp fault to the guest
1402 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1403 	 * Seems like this should have been the default.
1404 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
1405 	 * to deal with emulating these accesses.
1406 	 */
1407 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1408 	    VMASST_TYPE_4gb_segments) < 0)
1409 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1410 #endif	/* !_BOOT_TARGET_amd64 */
1411 
1412 #else	/* __xpv */
1413 
1414 	/*
1415 	 * use cpuid to enable MMU features
1416 	 */
1417 	if (have_cpuid()) {
1418 		uint32_t eax, edx;
1419 
1420 		eax = 1;
1421 		edx = get_cpuid_edx(&eax);
1422 		if (edx & CPUID_INTC_EDX_PSE)
1423 			largepage_support = 1;
1424 		if (edx & CPUID_INTC_EDX_PGE)
1425 			pge_support = 1;
1426 		if (edx & CPUID_INTC_EDX_PAE)
1427 			pae_support = 1;
1428 
1429 		eax = 0x80000000;
1430 		edx = get_cpuid_edx(&eax);
1431 		if (eax >= 0x80000001) {
1432 			eax = 0x80000001;
1433 			edx = get_cpuid_edx(&eax);
1434 			if (edx & CPUID_AMD_EDX_LM)
1435 				amd64_support = 1;
1436 			if (edx & CPUID_AMD_EDX_NX)
1437 				NX_support = 1;
1438 		}
1439 	} else {
1440 		dboot_printf("cpuid not supported\n");
1441 	}
1442 #endif /* __xpv */
1443 
1444 
1445 #if defined(_BOOT_TARGET_amd64)
1446 	if (amd64_support == 0)
1447 		dboot_panic("long mode not supported, rebooting");
1448 	else if (pae_support == 0)
1449 		dboot_panic("long mode, but no PAE; rebooting");
1450 #else
1451 	/*
1452 	 * Allow the command line to over-ride use of PAE for 32 bit.
1453 	 */
1454 	if (strstr(cmdline, "disablePAE=true") != NULL) {
1455 		pae_support = 0;
1456 		NX_support = 0;
1457 		amd64_support = 0;
1458 	}
1459 #endif
1460 
1461 	/*
1462 	 * initialize the simple memory allocator
1463 	 */
1464 	init_mem_alloc();
1465 
1466 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1467 	/*
1468 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1469 	 */
1470 	if (max_mem < FOUR_GIG && NX_support == 0)
1471 		pae_support = 0;
1472 #endif
1473 
1474 	/*
1475 	 * configure mmu information
1476 	 */
1477 	if (pae_support) {
1478 		shift_amt = shift_amt_pae;
1479 		ptes_per_table = 512;
1480 		pte_size = 8;
1481 		lpagesize = TWO_MEG;
1482 #if defined(_BOOT_TARGET_amd64)
1483 		top_level = 3;
1484 #else
1485 		top_level = 2;
1486 #endif
1487 	} else {
1488 		pae_support = 0;
1489 		NX_support = 0;
1490 		shift_amt = shift_amt_nopae;
1491 		ptes_per_table = 1024;
1492 		pte_size = 4;
1493 		lpagesize = FOUR_MEG;
1494 		top_level = 1;
1495 	}
1496 
1497 	DBG(pge_support);
1498 	DBG(NX_support);
1499 	DBG(largepage_support);
1500 	DBG(amd64_support);
1501 	DBG(top_level);
1502 	DBG(pte_size);
1503 	DBG(ptes_per_table);
1504 	DBG(lpagesize);
1505 
1506 #if defined(__xpv)
1507 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
1508 #else
1509 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
1510 #endif
1511 
1512 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1513 	/*
1514 	 * For grub, copy kernel bits from the ELF64 file to final place.
1515 	 */
1516 	DBG_MSG("\nAllocating nucleus pages.\n");
1517 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1518 	if (ktext_phys == 0)
1519 		dboot_panic("failed to allocate aligned kernel memory");
1520 	if (dboot_elfload64(mb_header.load_addr) != 0)
1521 		dboot_panic("failed to parse kernel ELF image, rebooting");
1522 #endif
1523 
1524 	DBG(ktext_phys);
1525 
1526 	/*
1527 	 * Allocate page tables.
1528 	 */
1529 	build_page_tables();
1530 
1531 	/*
1532 	 * return to assembly code to switch to running kernel
1533 	 */
1534 	entry_addr_low = (uint32_t)target_kernel_text;
1535 	DBG(entry_addr_low);
1536 	bi->bi_use_largepage = largepage_support;
1537 	bi->bi_use_pae = pae_support;
1538 	bi->bi_use_pge = pge_support;
1539 	bi->bi_use_nx = NX_support;
1540 
1541 #if defined(__xpv)
1542 
1543 	bi->bi_next_paddr = next_avail_addr - mfn_base;
1544 	DBG(bi->bi_next_paddr);
1545 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1546 	DBG(bi->bi_next_vaddr);
1547 
1548 	/*
1549 	 * unmap unused pages in start area to make them available for DMA
1550 	 */
1551 	while (next_avail_addr < scratch_end) {
1552 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
1553 		    0, UVMF_INVLPG | UVMF_LOCAL);
1554 		next_avail_addr += MMU_PAGESIZE;
1555 	}
1556 
1557 	bi->bi_xen_start_info = (uintptr_t)xen_info;
1558 	DBG((uintptr_t)HYPERVISOR_shared_info);
1559 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1560 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1561 
1562 #else /* __xpv */
1563 
1564 	bi->bi_next_paddr = next_avail_addr;
1565 	DBG(bi->bi_next_paddr);
1566 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1567 	DBG(bi->bi_next_vaddr);
1568 	bi->bi_mb_info = (uintptr_t)mb_info;
1569 	bi->bi_top_page_table = (uintptr_t)top_page_table;
1570 
1571 #endif /* __xpv */
1572 
1573 	bi->bi_kseg_size = FOUR_MEG;
1574 	DBG(bi->bi_kseg_size);
1575 
1576 #ifndef __xpv
1577 	if (map_debug)
1578 		dump_tables();
1579 #endif
1580 
1581 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1582 }
1583