xref: /titanic_51/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision dd850934386c395d7dd28457dab735df80de144c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 
28 #include <sys/types.h>
29 #include <sys/machparam.h>
30 #include <sys/x86_archext.h>
31 #include <sys/systm.h>
32 #include <sys/mach_mmu.h>
33 #include <sys/multiboot.h>
34 
35 #if defined(__xpv)
36 
37 #include <sys/hypervisor.h>
38 uintptr_t xen_virt_start;
39 pfn_t *mfn_to_pfn_mapping;
40 
41 #else /* !__xpv */
42 
43 extern multiboot_header_t mb_header;
44 extern int have_cpuid(void);
45 
46 #endif /* !__xpv */
47 
48 #include <sys/inttypes.h>
49 #include <sys/bootinfo.h>
50 #include <sys/mach_mmu.h>
51 #include <sys/boot_console.h>
52 
53 #include "dboot_asm.h"
54 #include "dboot_printf.h"
55 #include "dboot_xboot.h"
56 #include "dboot_elfload.h"
57 
58 /*
59  * This file contains code that runs to transition us from either a multiboot
60  * compliant loader (32 bit non-paging) or a XPV domain loader to
61  * regular kernel execution. Its task is to setup the kernel memory image
62  * and page tables.
63  *
64  * The code executes as:
65  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
66  * 	- a 32 bit program for the 32-bit PV hypervisor
67  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
68  *
69  * Under the PV hypervisor, we must create mappings for any memory beyond the
70  * initial start of day allocation (such as the kernel itself).
71  *
72  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
73  * Since we are running in real mode, all such memory is accessible.
74  */
75 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 * ptp_bits is OR'ed into entries that point at a lower level page table
 * (see make_ptable()); pte_bits is OR'ed into the final mapping entries
 * (see map_ma_at_va()).
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * These are the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation: the lowest address that
 * do_mem_alloc() will consider handing out next.
 */
static paddr_t next_avail_addr = 0;
101 
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 *
 * scratch_end is computed in init_mem_alloc(); do_mem_alloc() panics if an
 * allocation would extend beyond it.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 * mb_info is not assigned in this part of the file; presumably it is set
 * by the assembly entry code before startup_kernel() runs.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */
122 
/*
 * This contains information passed to the kernel.
 * "bi" points at the 16 byte aligned xboot_info carved out of boot_info[]
 * by startup_kernel().
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * memlists[] describes installed physical memory and pcimemlists[] the
 * address ranges left over for PCI; the *_used counters give the number of
 * valid leading entries in each array.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;

/*
 * Boot modules handed to us by the loader, filled in by init_mem_alloc().
 */
#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags, enabled when the boot command line contains
 * "prom_debug" or "map_debug" respectively (see startup_kernel()).
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
167 
168 /*
169  * Either hypervisor-specific or grub-specific code builds the initial
170  * memlists. This code does the sort/merge/link for final use.
171  */
172 static void
173 sort_physinstall(void)
174 {
175 	int i;
176 #if !defined(__xpv)
177 	int j;
178 	struct boot_memlist tmp;
179 
180 	/*
181 	 * Now sort the memlists, in case they weren't in order.
182 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
183 	 */
184 	DBG_MSG("Sorting phys-installed list\n");
185 	for (j = memlists_used - 1; j > 0; --j) {
186 		for (i = 0; i < j; ++i) {
187 			if (memlists[i].addr < memlists[i + 1].addr)
188 				continue;
189 			tmp = memlists[i];
190 			memlists[i] = memlists[i + 1];
191 			memlists[i + 1] = tmp;
192 		}
193 	}
194 
195 	/*
196 	 * Merge any memlists that don't have holes between them.
197 	 */
198 	for (i = 0; i <= memlists_used - 1; ++i) {
199 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
200 			continue;
201 
202 		if (prom_debug)
203 			dboot_printf(
204 			    "merging mem segs %" PRIx64 "...%" PRIx64
205 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
206 			    memlists[i].addr,
207 			    memlists[i].addr + memlists[i].size,
208 			    memlists[i + 1].addr,
209 			    memlists[i + 1].addr + memlists[i + 1].size);
210 
211 		memlists[i].size += memlists[i + 1].size;
212 		for (j = i + 1; j < memlists_used - 1; ++j)
213 			memlists[j] = memlists[j + 1];
214 		--memlists_used;
215 		DBG(memlists_used);
216 		--i;	/* after merging we need to reexamine, so do this */
217 	}
218 #endif	/* __xpv */
219 
220 	if (prom_debug) {
221 		dboot_printf("\nFinal memlists:\n");
222 		for (i = 0; i < memlists_used; ++i) {
223 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
224 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
225 		}
226 	}
227 
228 	/*
229 	 * link together the memlists with native size pointers
230 	 */
231 	memlists[0].next = 0;
232 	memlists[0].prev = 0;
233 	for (i = 1; i < memlists_used; ++i) {
234 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
235 		memlists[i].next = 0;
236 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
237 	}
238 	bi->bi_phys_install = (native_ptr_t)memlists;
239 	DBG(bi->bi_phys_install);
240 }
241 
242 #if defined(__xpv)
243 
244 /*
245  * halt on the hypervisor after a delay to drain console output
246  */
247 void
248 dboot_halt(void)
249 {
250 	uint_t i = 10000;
251 
252 	while (--i)
253 		HYPERVISOR_yield();
254 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
255 }
256 
257 /*
258  * From a machine address, find the corresponding pseudo-physical address.
259  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
260  * Machine addresses are the real underlying hardware addresses.
261  * These are needed for page table entries. Note that this routine is
262  * poorly protected. A bad value of "ma" will cause a page fault.
263  */
264 paddr_t
265 ma_to_pa(maddr_t ma)
266 {
267 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
268 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
269 	paddr_t pa;
270 
271 	if (pfn >= xen_info->nr_pages)
272 		return (-(paddr_t)1);
273 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
274 #ifdef DEBUG
275 	if (ma != pa_to_ma(pa))
276 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
277 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
278 #endif
279 	return (pa);
280 }
281 
282 /*
283  * From a pseudo-physical address, find the corresponding machine address.
284  */
285 maddr_t
286 pa_to_ma(paddr_t pa)
287 {
288 	pfn_t pfn;
289 	ulong_t mfn;
290 
291 	pfn = mmu_btop(pa - mfn_base);
292 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
293 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
294 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
295 #ifdef DEBUG
296 	if (mfn_to_pfn_mapping[mfn] != pfn)
297 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
298 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
299 #endif
300 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
301 }
302 
303 #endif	/* __xpv */
304 
305 x86pte_t
306 get_pteval(paddr_t table, uint_t index)
307 {
308 	if (pae_support)
309 		return (((x86pte_t *)(uintptr_t)table)[index]);
310 	return (((x86pte32_t *)(uintptr_t)table)[index]);
311 }
312 
313 /*ARGSUSED*/
314 void
315 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
316 {
317 #ifdef __xpv
318 	mmu_update_t t;
319 	maddr_t mtable = pa_to_ma(table);
320 	int retcnt;
321 
322 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
323 	t.val = pteval;
324 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
325 		dboot_panic("HYPERVISOR_mmu_update() failed");
326 #else /* __xpv */
327 	uintptr_t tab_addr = (uintptr_t)table;
328 
329 	if (pae_support)
330 		((x86pte_t *)tab_addr)[index] = pteval;
331 	else
332 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
333 	if (level == top_level && level == 2)
334 		reload_cr3();
335 #endif /* __xpv */
336 }
337 
338 paddr_t
339 make_ptable(x86pte_t *pteval, uint_t level)
340 {
341 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
342 
343 	if (level == top_level && level == 2)
344 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
345 	else
346 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
347 
348 #ifdef __xpv
349 	/* Remove write permission to the new page table. */
350 	if (HYPERVISOR_update_va_mapping(new_table,
351 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
352 		dboot_panic("HYP_update_va_mapping error");
353 #endif
354 
355 	if (map_debug)
356 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
357 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
358 	return (new_table);
359 }
360 
361 x86pte_t *
362 map_pte(paddr_t table, uint_t index)
363 {
364 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
365 }
366 
367 /*
368  * dump out the contents of page tables...
369  */
370 static void
371 dump_tables(void)
372 {
373 	uint_t save_index[4];	/* for recursion */
374 	char *save_table[4];	/* for recursion */
375 	uint_t	l;
376 	uint64_t va;
377 	uint64_t pgsize;
378 	int index;
379 	int i;
380 	x86pte_t pteval;
381 	char *table;
382 	static char *tablist = "\t\t\t";
383 	char *tabs = tablist + 3 - top_level;
384 	uint_t pa, pa1;
385 #if !defined(__xpv)
386 #define	maddr_t paddr_t
387 #endif /* !__xpv */
388 
389 	dboot_printf("Finished pagetables:\n");
390 	table = (char *)(uintptr_t)top_page_table;
391 	l = top_level;
392 	va = 0;
393 	for (index = 0; index < ptes_per_table; ++index) {
394 		pgsize = 1ull << shift_amt[l];
395 		if (pae_support)
396 			pteval = ((x86pte_t *)table)[index];
397 		else
398 			pteval = ((x86pte32_t *)table)[index];
399 		if (pteval == 0)
400 			goto next_entry;
401 
402 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
403 		    tabs + l, table, index, (uint64_t)pteval, va);
404 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
405 		dboot_printf(" physaddr=%x\n", pa);
406 
407 		/*
408 		 * Don't try to walk hypervisor private pagetables
409 		 */
410 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
411 			save_table[l] = table;
412 			save_index[l] = index;
413 			--l;
414 			index = -1;
415 			table = (char *)(uintptr_t)
416 			    ma_to_pa(pteval & MMU_PAGEMASK);
417 			goto recursion;
418 		}
419 
420 		/*
421 		 * shorten dump for consecutive mappings
422 		 */
423 		for (i = 1; index + i < ptes_per_table; ++i) {
424 			if (pae_support)
425 				pteval = ((x86pte_t *)table)[index + i];
426 			else
427 				pteval = ((x86pte32_t *)table)[index + i];
428 			if (pteval == 0)
429 				break;
430 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
431 			if (pa1 != pa + i * pgsize)
432 				break;
433 		}
434 		if (i > 2) {
435 			dboot_printf("%s...\n", tabs + l);
436 			va += pgsize * (i - 2);
437 			index += i - 2;
438 		}
439 next_entry:
440 		va += pgsize;
441 		if (l == 3 && index == 256)	/* VA hole */
442 			va = 0xffff800000000000ull;
443 recursion:
444 		;
445 	}
446 	if (l < top_level) {
447 		++l;
448 		index = save_index[l];
449 		table = save_table[l];
450 		goto recursion;
451 	}
452 }
453 
454 /*
455  * Add a mapping for the machine page at the given virtual address.
456  */
457 static void
458 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
459 {
460 	x86pte_t *ptep;
461 	x86pte_t pteval;
462 
463 	pteval = ma | pte_bits;
464 	if (level > 0)
465 		pteval |= PT_PAGESIZE;
466 	if (va >= target_kernel_text && pge_support)
467 		pteval |= PT_GLOBAL;
468 
469 	if (map_debug && ma != va)
470 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
471 		    " pte=0x%" PRIx64 " l=%d\n",
472 		    (uint64_t)ma, (uint64_t)va, pteval, level);
473 
474 #if defined(__xpv)
475 	/*
476 	 * see if we can avoid find_pte() on the hypervisor
477 	 */
478 	if (HYPERVISOR_update_va_mapping(va, pteval,
479 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
480 		return;
481 #endif
482 
483 	/*
484 	 * Find the pte that will map this address. This creates any
485 	 * missing intermediate level page tables
486 	 */
487 	ptep = find_pte(va, NULL, level, 0);
488 
489 	/*
490 	 * When paravirtualized, we must use hypervisor calls to modify the
491 	 * PTE, since paging is active. On real hardware we just write to
492 	 * the pagetables which aren't in use yet.
493 	 */
494 #if defined(__xpv)
495 	ptep = ptep;	/* shut lint up */
496 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
497 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
498 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
499 		    (uint64_t)va, level, (uint64_t)ma, pteval);
500 #else
501 	if (va < 1024 * 1024)
502 		pteval |= PT_NOCACHE;		/* for video RAM */
503 	if (pae_support)
504 		*ptep = pteval;
505 	else
506 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
507 #endif
508 }
509 
510 /*
511  * Add a mapping for the physical page at the given virtual address.
512  */
513 static void
514 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
515 {
516 	map_ma_at_va(pa_to_ma(pa), va, level);
517 }
518 
519 /*
520  * This is called to remove start..end from the
521  * possible range of PCI addresses.
522  */
523 const uint64_t pci_lo_limit = 0x00100000ul;
524 const uint64_t pci_hi_limit = 0xfff00000ul;
525 static void
526 exclude_from_pci(uint64_t start, uint64_t end)
527 {
528 	int i;
529 	int j;
530 	struct boot_memlist *ml;
531 
532 	for (i = 0; i < pcimemlists_used; ++i) {
533 		ml = &pcimemlists[i];
534 
535 		/* delete the entire range? */
536 		if (start <= ml->addr && ml->addr + ml->size <= end) {
537 			--pcimemlists_used;
538 			for (j = i; j < pcimemlists_used; ++j)
539 				pcimemlists[j] = pcimemlists[j + 1];
540 			--i;	/* to revisit the new one at this index */
541 		}
542 
543 		/* split a range? */
544 		else if (ml->addr < start && end < ml->addr + ml->size) {
545 
546 			++pcimemlists_used;
547 			if (pcimemlists_used > MAX_MEMLIST)
548 				dboot_panic("too many pcimemlists");
549 
550 			for (j = pcimemlists_used - 1; j > i; --j)
551 				pcimemlists[j] = pcimemlists[j - 1];
552 			ml->size = start - ml->addr;
553 
554 			++ml;
555 			ml->size = (ml->addr + ml->size) - end;
556 			ml->addr = end;
557 			++i;	/* skip on to next one */
558 		}
559 
560 		/* cut memory off the start? */
561 		else if (ml->addr < end && end < ml->addr + ml->size) {
562 			ml->size -= end - ml->addr;
563 			ml->addr = end;
564 		}
565 
566 		/* cut memory off the end? */
567 		else if (ml->addr <= start && start < ml->addr + ml->size) {
568 			ml->size = start - ml->addr;
569 		}
570 	}
571 }
572 
573 /*
574  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
575  * definition in Xen source.
576  */
577 #ifdef __xpv
578 typedef struct {
579 	uint32_t	base_addr_low;
580 	uint32_t	base_addr_high;
581 	uint32_t	length_low;
582 	uint32_t	length_high;
583 	uint32_t	type;
584 } mmap_t;
585 #else
586 typedef mb_memory_map_t mmap_t;
587 #endif
588 
589 static void
590 build_pcimemlists(mmap_t *mem, int num)
591 {
592 	mmap_t *mmap;
593 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
594 	uint64_t start;
595 	uint64_t end;
596 	int i;
597 
598 	/*
599 	 * initialize
600 	 */
601 	pcimemlists[0].addr = pci_lo_limit;
602 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
603 	pcimemlists_used = 1;
604 
605 	/*
606 	 * Fill in PCI memlists.
607 	 */
608 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
609 		start = ((uint64_t)mmap->base_addr_high << 32) +
610 		    mmap->base_addr_low;
611 		end = start + ((uint64_t)mmap->length_high << 32) +
612 		    mmap->length_low;
613 
614 		if (prom_debug)
615 			dboot_printf("\ttype: %d %" PRIx64 "..%"
616 			    PRIx64 "\n", mmap->type, start, end);
617 
618 		/*
619 		 * page align start and end
620 		 */
621 		start = (start + page_offset) & ~page_offset;
622 		end &= ~page_offset;
623 		if (end <= start)
624 			continue;
625 
626 		exclude_from_pci(start, end);
627 	}
628 
629 	/*
630 	 * Finish off the pcimemlist
631 	 */
632 	if (prom_debug) {
633 		for (i = 0; i < pcimemlists_used; ++i) {
634 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
635 			    PRIx64 "\n", pcimemlists[i].addr,
636 			    pcimemlists[i].addr + pcimemlists[i].size);
637 		}
638 	}
639 	pcimemlists[0].next = 0;
640 	pcimemlists[0].prev = 0;
641 	for (i = 1; i < pcimemlists_used; ++i) {
642 		pcimemlists[i].prev =
643 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
644 		pcimemlists[i].next = 0;
645 		pcimemlists[i - 1].next =
646 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
647 	}
648 	bi->bi_pcimem = (native_ptr_t)pcimemlists;
649 	DBG(bi->bi_pcimem);
650 }
651 
652 #if defined(__xpv)
653 /*
654  * Initialize memory allocator stuff from hypervisor-supplied start info.
655  *
656  * There is 512KB of scratch area after the boot stack page.
657  * We'll use that for everything except the kernel nucleus pages which are too
658  * big to fit there and are allocated last anyway.
659  */
660 #define	MAXMAPS	100
661 static mmap_t map_buffer[MAXMAPS];
662 static void
663 init_mem_alloc(void)
664 {
665 	int	local;	/* variables needed to find start region */
666 	paddr_t	scratch_start;
667 	xen_memory_map_t map;
668 
669 	DBG_MSG("Entered init_mem_alloc()\n");
670 
671 	/*
672 	 * Free memory follows the stack. There's at least 512KB of scratch
673 	 * space, rounded up to at least 2Mb alignment.  That should be enough
674 	 * for the page tables we'll need to build.  The nucleus memory is
675 	 * allocated last and will be outside the addressible range.  We'll
676 	 * switch to new page tables before we unpack the kernel
677 	 */
678 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
679 	DBG(scratch_start);
680 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
681 	DBG(scratch_end);
682 
683 	/*
684 	 * For paranoia, leave some space between hypervisor data and ours.
685 	 * Use 500 instead of 512.
686 	 */
687 	next_avail_addr = scratch_end - 500 * 1024;
688 	DBG(next_avail_addr);
689 
690 	/*
691 	 * The domain builder gives us at most 1 module
692 	 */
693 	DBG(xen_info->mod_len);
694 	if (xen_info->mod_len > 0) {
695 		DBG(xen_info->mod_start);
696 		modules[0].bm_addr = xen_info->mod_start;
697 		modules[0].bm_size = xen_info->mod_len;
698 		bi->bi_module_cnt = 1;
699 		bi->bi_modules = (native_ptr_t)modules;
700 	} else {
701 		bi->bi_module_cnt = 0;
702 		bi->bi_modules = NULL;
703 	}
704 	DBG(bi->bi_module_cnt);
705 	DBG(bi->bi_modules);
706 
707 	DBG(xen_info->mfn_list);
708 	DBG(xen_info->nr_pages);
709 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
710 	DBG(max_mem);
711 
712 	/*
713 	 * Using pseudo-physical addresses, so only 1 memlist element
714 	 */
715 	memlists[0].addr = 0;
716 	DBG(memlists[0].addr);
717 	memlists[0].size = max_mem;
718 	DBG(memlists[0].size);
719 	memlists_used = 1;
720 	DBG(memlists_used);
721 
722 	/*
723 	 * finish building physinstall list
724 	 */
725 	sort_physinstall();
726 
727 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
728 		/*
729 		 * build PCI Memory list
730 		 */
731 		map.nr_entries = MAXMAPS;
732 		/*LINTED: constant in conditional context*/
733 		set_xen_guest_handle(map.buffer, map_buffer);
734 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
735 			dboot_panic("getting XENMEM_machine_memory_map failed");
736 		build_pcimemlists(map_buffer, map.nr_entries);
737 	}
738 }
739 
740 #else	/* !__xpv */
741 
742 /*
743  * During memory allocation, find the highest address not used yet.
744  */
745 static void
746 check_higher(paddr_t a)
747 {
748 	if (a < next_avail_addr)
749 		return;
750 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
751 	DBG(next_avail_addr);
752 }
753 
754 /*
755  * Walk through the module information finding the last used address.
756  * The first available address will become the top level page table.
757  *
758  * We then build the phys_install memlist from the multiboot information.
759  */
760 static void
761 init_mem_alloc(void)
762 {
763 	mb_memory_map_t *mmap;
764 	mb_module_t *mod;
765 	uint64_t start;
766 	uint64_t end;
767 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
768 	extern char _end[];
769 	int i;
770 
771 	DBG_MSG("Entered init_mem_alloc()\n");
772 	DBG((uintptr_t)mb_info);
773 
774 	/*
775 	 * search the modules to find the last used address
776 	 * we'll build the module list while we're walking through here
777 	 */
778 	DBG_MSG("\nFinding Modules\n");
779 	check_higher((paddr_t)&_end);
780 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
781 	    i < mb_info->mods_count;
782 	    ++mod, ++i) {
783 		if (prom_debug) {
784 			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
785 			    i, (char *)(mod->mod_name),
786 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
787 		}
788 		modules[i].bm_addr = mod->mod_start;
789 		modules[i].bm_size = mod->mod_end;
790 
791 		check_higher(mod->mod_end);
792 	}
793 	bi->bi_modules = (native_ptr_t)modules;
794 	DBG(bi->bi_modules);
795 	bi->bi_module_cnt = mb_info->mods_count;
796 	DBG(bi->bi_module_cnt);
797 
798 	/*
799 	 * Walk through the memory map from multiboot and build our memlist
800 	 * structures. Note these will have native format pointers.
801 	 */
802 	DBG_MSG("\nFinding Memory Map\n");
803 	DBG(mb_info->flags);
804 	max_mem = 0;
805 	if (mb_info->flags & 0x40) {
806 		int cnt = 0;
807 
808 		DBG(mb_info->mmap_addr);
809 		DBG(mb_info->mmap_length);
810 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
811 
812 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
813 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
814 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
815 		    + sizeof (mmap->size))) {
816 			++cnt;
817 			start = ((uint64_t)mmap->base_addr_high << 32) +
818 			    mmap->base_addr_low;
819 			end = start + ((uint64_t)mmap->length_high << 32) +
820 			    mmap->length_low;
821 
822 			if (prom_debug)
823 				dboot_printf("\ttype: %d %" PRIx64 "..%"
824 				    PRIx64 "\n", mmap->type, start, end);
825 
826 			/*
827 			 * page align start and end
828 			 */
829 			start = (start + page_offset) & ~page_offset;
830 			end &= ~page_offset;
831 			if (end <= start)
832 				continue;
833 
834 			/*
835 			 * only type 1 is usable RAM
836 			 */
837 			if (mmap->type != 1)
838 				continue;
839 
840 			if (end > max_mem)
841 				max_mem = end;
842 
843 			memlists[memlists_used].addr = start;
844 			memlists[memlists_used].size = end - start;
845 			++memlists_used;
846 			if (memlists_used > MAX_MEMLIST)
847 				dboot_panic("too many memlists");
848 		}
849 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
850 	} else if (mb_info->flags & 0x01) {
851 		DBG(mb_info->mem_lower);
852 		memlists[memlists_used].addr = 0;
853 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
854 		++memlists_used;
855 		DBG(mb_info->mem_upper);
856 		memlists[memlists_used].addr = 1024 * 1024;
857 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
858 		++memlists_used;
859 
860 		/*
861 		 * Old platform - assume I/O space at the end of memory.
862 		 */
863 		pcimemlists[0].addr =
864 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
865 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
866 		pcimemlists[0].next = 0;
867 		pcimemlists[0].prev = 0;
868 		bi->bi_pcimem = (native_ptr_t)pcimemlists;
869 		DBG(bi->bi_pcimem);
870 	} else {
871 		dboot_panic("No memory info from boot loader!!!");
872 	}
873 
874 	check_higher(bi->bi_cmdline);
875 
876 	/*
877 	 * finish processing the physinstall list
878 	 */
879 	sort_physinstall();
880 }
881 #endif /* !__xpv */
882 
883 /*
884  * Simple memory allocator, allocates aligned physical memory.
885  * Note that startup_kernel() only allocates memory, never frees.
886  * Memory usage just grows in an upward direction.
887  */
888 static void *
889 do_mem_alloc(uint32_t size, uint32_t align)
890 {
891 	uint_t i;
892 	uint64_t best;
893 	uint64_t start;
894 	uint64_t end;
895 
896 	/*
897 	 * make sure size is a multiple of pagesize
898 	 */
899 	size = RNDUP(size, MMU_PAGESIZE);
900 	next_avail_addr = RNDUP(next_avail_addr, align);
901 
902 	/*
903 	 * XXPV fixme joe
904 	 *
905 	 * a really large bootarchive that causes you to run out of memory
906 	 * may cause this to blow up
907 	 */
908 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
909 	best = (uint64_t)-size;
910 	for (i = 0; i < memlists_used; ++i) {
911 		start = memlists[i].addr;
912 #if defined(__xpv)
913 		start += mfn_base;
914 #endif
915 		end = start + memlists[i].size;
916 
917 		/*
918 		 * did we find the desired address?
919 		 */
920 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
921 			best = next_avail_addr;
922 			goto done;
923 		}
924 
925 		/*
926 		 * if not is this address the best so far?
927 		 */
928 		if (start > next_avail_addr && start < best &&
929 		    RNDUP(start, align) + size <= end)
930 			best = RNDUP(start, align);
931 	}
932 
933 	/*
934 	 * We didn't find exactly the address we wanted, due to going off the
935 	 * end of a memory region. Return the best found memory address.
936 	 */
937 done:
938 	next_avail_addr = best + size;
939 #if defined(__xpv)
940 	if (next_avail_addr > scratch_end)
941 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
942 		    "0x%lx", (ulong_t)next_avail_addr,
943 		    (ulong_t)scratch_end);
944 #endif
945 	(void) memset((void *)(uintptr_t)best, 0, size);
946 	return ((void *)(uintptr_t)best);
947 }
948 
949 void *
950 mem_alloc(uint32_t size)
951 {
952 	return (do_mem_alloc(size, MMU_PAGESIZE));
953 }
954 
955 
956 /*
957  * Build page tables to map all of memory used so far as well as the kernel.
958  */
959 static void
960 build_page_tables(void)
961 {
962 	uint32_t psize;
963 	uint32_t level;
964 	uint32_t off;
965 	uint64_t start;
966 #if !defined(__xpv)
967 	uint32_t i;
968 	uint64_t end;
969 #endif	/* __xpv */
970 
971 	/*
972 	 * If we're on metal, we need to create the top level pagetable.
973 	 */
974 #if defined(__xpv)
975 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
976 #else /* __xpv */
977 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
978 #endif /* __xpv */
979 	DBG((uintptr_t)top_page_table);
980 
981 	/*
982 	 * Determine if we'll use large mappings for kernel, then map it.
983 	 */
984 	if (largepage_support) {
985 		psize = lpagesize;
986 		level = 1;
987 	} else {
988 		psize = MMU_PAGESIZE;
989 		level = 0;
990 	}
991 
992 	DBG_MSG("Mapping kernel\n");
993 	DBG(ktext_phys);
994 	DBG(target_kernel_text);
995 	DBG(ksize);
996 	DBG(psize);
997 	for (off = 0; off < ksize; off += psize)
998 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
999 
1000 	/*
1001 	 * The kernel will need a 1 page window to work with page tables
1002 	 */
1003 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1004 	DBG(bi->bi_pt_window);
1005 	bi->bi_pte_to_pt_window =
1006 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1007 	DBG(bi->bi_pte_to_pt_window);
1008 
1009 #if defined(__xpv)
1010 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1011 		/* If this is a domU we're done. */
1012 		DBG_MSG("\nPage tables constructed\n");
1013 		return;
1014 	}
1015 #endif /* __xpv */
1016 
1017 	/*
1018 	 * We need 1:1 mappings for the lower 1M of memory to access
1019 	 * BIOS tables used by a couple of drivers during boot.
1020 	 *
1021 	 * The following code works because our simple memory allocator
1022 	 * only grows usage in an upwards direction.
1023 	 *
1024 	 * Note that by this point in boot some mappings for low memory
1025 	 * may already exist because we've already accessed device in low
1026 	 * memory.  (Specifically the video frame buffer and keyboard
1027 	 * status ports.)  If we're booting on raw hardware then GRUB
1028 	 * created these mappings for us.  If we're booting under a
1029 	 * hypervisor then we went ahead and remapped these devices into
1030 	 * memory allocated within dboot itself.
1031 	 */
1032 	if (map_debug)
1033 		dboot_printf("1:1 map pa=0..1Meg\n");
1034 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1035 #if defined(__xpv)
1036 		map_ma_at_va(start, start, 0);
1037 #else /* __xpv */
1038 		map_pa_at_va(start, start, 0);
1039 #endif /* __xpv */
1040 	}
1041 
1042 #if !defined(__xpv)
1043 	for (i = 0; i < memlists_used; ++i) {
1044 		start = memlists[i].addr;
1045 
1046 		end = start + memlists[i].size;
1047 
1048 		if (map_debug)
1049 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1050 			    start, end);
1051 		while (start < end && start < next_avail_addr) {
1052 			map_pa_at_va(start, start, 0);
1053 			start += MMU_PAGESIZE;
1054 		}
1055 	}
1056 #endif /* !__xpv */
1057 
1058 	DBG_MSG("\nPage tables constructed\n");
1059 }
1060 
/*
 * Message handed to dboot_panic() by startup_kernel() when the boot command
 * line contains "multiboot".
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
1067 
1068 /*
1069  * startup_kernel parses the boot command line, works out which MMU features
1070  * to use, builds pagetables with 1:1 mappings for all memory in use plus the
1071  * kernel nucleus at virtual address target_kernel_text using large page
1072  * mappings, and records the results in the xboot_info (bi). The page table
1073  * pages are also accessible at 1:1 mapped virtual addresses.
1074  */
1075 /*ARGSUSED*/
1076 void
1077 startup_kernel(void)
1078 {
1079 	char *cmdline;
1080 	uintptr_t addr;
1081 #if defined(__xpv)
1082 	physdev_set_iopl_t set_iopl;
1083 #endif /* __xpv */
1084 
1085 	/*
1086 	 * When entered from multiboot we are in 32 bit mode without paging.
1087 	 */
1088 #if defined(__xpv)
1089 	cmdline = (char *)xen_info->cmd_line;
1090 #else /* __xpv */
1091 	cmdline = (char *)mb_info->cmdline;
1092 #endif /* __xpv */
1093 
1094 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1095 	map_debug = (strstr(cmdline, "map_debug") != NULL);
1096 
1097 #if defined(__xpv)
1098 	/*
1099 	 * For dom0, before we initialize the console subsystem we'll
1100 	 * need to enable io operations, so set I/O privilege level to 1.
1101 	 */
1102 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1103 		set_iopl.iopl = 1;
1104 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1105 	}
1106 #endif /* __xpv */
1107 
1108 	bcons_init(cmdline);
1109 	DBG_MSG("\n\nSolaris prekernel set: ");
	/*
	 * NOTE(review): cmdline comes from the boot loader and is handed to
	 * DBG_MSG() as-is; if DBG_MSG() uses its argument as a printf format,
	 * a '%' in the command line would be interpreted -- confirm.
	 */
1110 	DBG_MSG(cmdline);
1111 	DBG_MSG("\n");
1112 
1113 	if (strstr(cmdline, "multiboot") != NULL) {
1114 		dboot_panic(NO_MULTIBOOT);
1115 	}
1116 
1117 	/*
1118 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1119 	 */
1120 	addr = (uintptr_t)boot_info;
1121 	addr = (addr + 0xf) & ~0xf;
1122 	bi = (struct xboot_info *)addr;
1123 	DBG((uintptr_t)bi);
1124 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1125 
1126 	/*
1127 	 * Need correct target_kernel_text value
1128 	 */
1129 #if defined(_BOOT_TARGET_amd64)
1130 	target_kernel_text = KERNEL_TEXT_amd64;
1131 #elif defined(__xpv)
1132 	target_kernel_text = KERNEL_TEXT_i386_xpv;
1133 #else
1134 	target_kernel_text = KERNEL_TEXT_i386;
1135 #endif
1136 	DBG(target_kernel_text);
1137 
1138 #if defined(__xpv)
1139 
1140 	/*
1141 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
1142 	 */
1143 
1144 #if defined(_BOOT_TARGET_amd64)
1145 	/*
1146 	 * 64-bit hypervisor.
1147 	 */
1148 	amd64_support = 1;
1149 	pae_support = 1;
1150 
1151 #else	/* _BOOT_TARGET_amd64 */
1152 
1153 	/*
1154 	 * See if we are running on a PAE Hypervisor
1155 	 */
1156 	{
1157 		xen_capabilities_info_t caps;
1158 
1159 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1160 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
1161 		caps[sizeof (caps) - 1] = 0;
1162 		if (prom_debug)
1163 			dboot_printf("xen capabilities %s\n", caps);
1164 		if (strstr(caps, "x86_32p") != NULL)
1165 			pae_support = 1;
1166 	}
1167 
1168 #endif	/* _BOOT_TARGET_amd64 */
1169 	{
1170 		xen_platform_parameters_t p;
1171 
1172 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1173 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
1174 		DBG(p.virt_start);
1175 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1176 	}
1177 
1178 	/*
1179 	 * The hypervisor loads stuff starting at 1Gig
1180 	 */
1181 	mfn_base = ONE_GIG;
1182 	DBG(mfn_base);
1183 
1184 	/*
1185 	 * enable writable page table mode for the hypervisor
1186 	 */
1187 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1188 	    VMASST_TYPE_writable_pagetables) < 0)
1189 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1190 
1191 	/*
1192 	 * check for NX support
1193 	 */
1194 	if (pae_support) {
1195 		uint32_t eax = 0x80000000;
1196 		uint32_t edx = get_cpuid_edx(&eax);
1197 
1198 		if (eax >= 0x80000001) {
1199 			eax = 0x80000001;
1200 			edx = get_cpuid_edx(&eax);
1201 			if (edx & CPUID_AMD_EDX_NX)
1202 				NX_support = 1;
1203 		}
1204 	}
1205 
1206 #if !defined(_BOOT_TARGET_amd64)
1207 
1208 	/*
1209 	 * The 32-bit hypervisor uses segmentation to protect itself from
1210 	 * guests. This means when a guest attempts to install a flat 4GB
1211 	 * code or data descriptor the 32-bit hypervisor will protect itself
1212 	 * by silently shrinking the segment such that if the guest attempts
1213 	 * any access where the hypervisor lives a #gp fault is generated.
1214 	 * The problem is that some applications expect a full 4GB flat
1215 	 * segment for their current thread pointer and will use negative
1216 	 * offset segment wrap around to access data. TLS support in linux
1217 	 * brand is one example of this.
1218 	 *
1219 	 * The 32-bit hypervisor can catch the #gp fault in these cases
1220 	 * and emulate the access without passing the #gp fault to the guest
1221 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1222 	 * Seems like this should have been the default.
1223 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
1224 	 * with emulating these accesses.
1225 	 */
1226 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1227 	    VMASST_TYPE_4gb_segments) < 0)
1228 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1229 #endif	/* !_BOOT_TARGET_amd64 */
1230 
1231 #else	/* __xpv */
1232 
1233 	/*
1234 	 * use cpuid to enable MMU features
1235 	 */
1236 	if (have_cpuid()) {
1237 		uint32_t eax, edx;
1238 
1239 		eax = 1;
1240 		edx = get_cpuid_edx(&eax);
1241 		if (edx & CPUID_INTC_EDX_PSE)
1242 			largepage_support = 1;
1243 		if (edx & CPUID_INTC_EDX_PGE)
1244 			pge_support = 1;
1245 		if (edx & CPUID_INTC_EDX_PAE)
1246 			pae_support = 1;
1247 
1248 		eax = 0x80000000;
1249 		edx = get_cpuid_edx(&eax);
1250 		if (eax >= 0x80000001) {
1251 			eax = 0x80000001;
1252 			edx = get_cpuid_edx(&eax);
1253 			if (edx & CPUID_AMD_EDX_LM)
1254 				amd64_support = 1;
1255 			if (edx & CPUID_AMD_EDX_NX)
1256 				NX_support = 1;
1257 		}
1258 	} else {
1259 		dboot_printf("cpuid not supported\n");
1260 	}
1261 #endif /* __xpv */
1262 
1263 
1264 #if defined(_BOOT_TARGET_amd64)
1265 	if (amd64_support == 0)
1266 		dboot_panic("long mode not supported, rebooting");
1267 	else if (pae_support == 0)
1268 		dboot_panic("long mode, but no PAE; rebooting");
1269 #else
1270 	/*
1271 	 * Allow the command line to over-ride use of PAE for 32 bit.
1272 	 */
1273 	if (strstr(cmdline, "disablePAE=true") != NULL) {
1274 		pae_support = 0;
1275 		NX_support = 0;
1276 		amd64_support = 0;
1277 	}
1278 #endif
1279 
1280 	/*
1281 	 * initialize the simple memory allocator
1282 	 */
1283 	init_mem_alloc();
1284 
1285 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1286 	/*
1287 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1288 	 */
1289 	if (max_mem < FOUR_GIG && NX_support == 0)
1290 		pae_support = 0;
1291 #endif
1292 
1293 	/*
1294 	 * configure mmu information for PAE or non-PAE page tables
1295 	 */
1296 	if (pae_support) {
1297 		shift_amt = shift_amt_pae;
1298 		ptes_per_table = 512;
1299 		pte_size = 8;
1300 		lpagesize = TWO_MEG;
1301 #if defined(_BOOT_TARGET_amd64)
1302 		top_level = 3;
1303 #else
1304 		top_level = 2;
1305 #endif
1306 	} else {
1307 		pae_support = 0;
1308 		NX_support = 0;
1309 		shift_amt = shift_amt_nopae;
1310 		ptes_per_table = 1024;
1311 		pte_size = 4;
1312 		lpagesize = FOUR_MEG;
1313 		top_level = 1;
1314 	}
1315 
1316 	DBG(pge_support);
1317 	DBG(NX_support);
1318 	DBG(largepage_support);
1319 	DBG(amd64_support);
1320 	DBG(top_level);
1321 	DBG(pte_size);
1322 	DBG(ptes_per_table);
1323 	DBG(lpagesize);
1324 
1325 #if defined(__xpv)
1326 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
1327 #else
1328 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
1329 #endif
1330 
1331 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1332 	/*
1333 	 * For grub, copy kernel bits from the ELF64 file to final place.
1334 	 */
1335 	DBG_MSG("\nAllocating nucleus pages.\n");
1336 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1337 	if (ktext_phys == 0)
1338 		dboot_panic("failed to allocate aligned kernel memory");
1339 	if (dboot_elfload64(mb_header.load_addr) != 0)
1340 		dboot_panic("failed to parse kernel ELF image, rebooting");
1341 #endif
1342 
1343 	DBG(ktext_phys);
1344 
1345 	/*
1346 	 * Build the page tables.
1347 	 */
1348 	build_page_tables();
1349 
1350 	/*
1351 	 * return to assembly code to switch to running kernel
1352 	 */
1353 	entry_addr_low = (uint32_t)target_kernel_text;
1354 	DBG(entry_addr_low);
1355 	bi->bi_use_largepage = largepage_support;
1356 	bi->bi_use_pae = pae_support;
1357 	bi->bi_use_pge = pge_support;
1358 	bi->bi_use_nx = NX_support;
1359 
1360 #if defined(__xpv)
1361 
1362 	bi->bi_next_paddr = next_avail_addr - mfn_base;
1363 	DBG(bi->bi_next_paddr);
1364 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1365 	DBG(bi->bi_next_vaddr);
1366 
1367 	/*
1368 	 * unmap unused pages in start area to make them available for DMA
1369 	 */
1370 	while (next_avail_addr < scratch_end) {
1371 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
1372 		    0, UVMF_INVLPG | UVMF_LOCAL);
1373 		next_avail_addr += MMU_PAGESIZE;
1374 	}
1375 
1376 	bi->bi_xen_start_info = (uintptr_t)xen_info;
1377 	DBG((uintptr_t)HYPERVISOR_shared_info);
1378 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1379 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1380 
1381 #else /* __xpv */
1382 
1383 	bi->bi_next_paddr = next_avail_addr;
1384 	DBG(bi->bi_next_paddr);
1385 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1386 	DBG(bi->bi_next_vaddr);
1387 	bi->bi_mb_info = (uintptr_t)mb_info;
1388 	bi->bi_top_page_table = (uintptr_t)top_page_table;
1389 
1390 #endif /* __xpv */
1391 
1392 	bi->bi_kseg_size = FOUR_MEG;
1393 	DBG(bi->bi_kseg_size);
1394 
1395 #ifndef __xpv
1396 	if (map_debug)
1397 		dump_tables();
1398 #endif
1399 
1400 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1401 }
1402