xref: /titanic_51/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 7ff836697c120cb94bd30d5c2204eb9b74718e4c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 
28 #include <sys/types.h>
29 #include <sys/machparam.h>
30 #include <sys/x86_archext.h>
31 #include <sys/systm.h>
32 #include <sys/mach_mmu.h>
33 #include <sys/multiboot.h>
34 
35 #if defined(__xpv)
36 
37 #include <sys/hypervisor.h>
38 uintptr_t xen_virt_start;
39 pfn_t *mfn_to_pfn_mapping;
40 
41 #else /* !__xpv */
42 
43 extern multiboot_header_t mb_header;
44 extern int have_cpuid(void);
45 
46 #endif /* !__xpv */
47 
48 #include <sys/inttypes.h>
49 #include <sys/bootinfo.h>
50 #include <sys/mach_mmu.h>
51 #include <sys/boot_console.h>
52 
53 #include "dboot_asm.h"
54 #include "dboot_printf.h"
55 #include "dboot_xboot.h"
56 #include "dboot_elfload.h"
57 
58 /*
59  * This file contains code that runs to transition us from either a multiboot
60  * compliant loader (32 bit non-paging) or a XPV domain loader to
61  * regular kernel execution. Its task is to setup the kernel memory image
62  * and page tables.
63  *
64  * The code executes as:
65  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
66  * 	- a 32 bit program for the 32-bit PV hypervisor
67  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
68  *
69  * Under the PV hypervisor, we must create mappings for any memory beyond the
70  * initial start of day allocation (such as the kernel itself).
71  *
72  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
74  */
75 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 *
 * ptp_bits is OR'ed into the entry that make_ptable() builds for a new
 * page table (except for a level 2 top table, which only gets PT_VALID).
 * pte_bits is the starting value for every leaf mapping created by
 * map_ma_at_va().
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation.
 * do_mem_alloc() hands out memory starting here and moves it upward.
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */
122 
/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each array has a matching *_used count of the entries filled in so far:
 * memlists holds installed physical memory, pcimemlists the address ranges
 * left over for PCI, and rsvdmemlists the BIOS reserved ranges.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * Boot modules handed to us by the loader.
 */
#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging macros
 * Both flags are set by startup_kernel() when the matching word appears
 * on the boot command line.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
169 
170 /*
171  * Either hypervisor-specific or grub-specific code builds the initial
172  * memlists. This code does the sort/merge/link for final use.
173  */
174 static void
175 sort_physinstall(void)
176 {
177 	int i;
178 #if !defined(__xpv)
179 	int j;
180 	struct boot_memlist tmp;
181 
182 	/*
183 	 * Now sort the memlists, in case they weren't in order.
184 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
185 	 */
186 	DBG_MSG("Sorting phys-installed list\n");
187 	for (j = memlists_used - 1; j > 0; --j) {
188 		for (i = 0; i < j; ++i) {
189 			if (memlists[i].addr < memlists[i + 1].addr)
190 				continue;
191 			tmp = memlists[i];
192 			memlists[i] = memlists[i + 1];
193 			memlists[i + 1] = tmp;
194 		}
195 	}
196 
197 	/*
198 	 * Merge any memlists that don't have holes between them.
199 	 */
200 	for (i = 0; i <= memlists_used - 1; ++i) {
201 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
202 			continue;
203 
204 		if (prom_debug)
205 			dboot_printf(
206 			    "merging mem segs %" PRIx64 "...%" PRIx64
207 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
208 			    memlists[i].addr,
209 			    memlists[i].addr + memlists[i].size,
210 			    memlists[i + 1].addr,
211 			    memlists[i + 1].addr + memlists[i + 1].size);
212 
213 		memlists[i].size += memlists[i + 1].size;
214 		for (j = i + 1; j < memlists_used - 1; ++j)
215 			memlists[j] = memlists[j + 1];
216 		--memlists_used;
217 		DBG(memlists_used);
218 		--i;	/* after merging we need to reexamine, so do this */
219 	}
220 #endif	/* __xpv */
221 
222 	if (prom_debug) {
223 		dboot_printf("\nFinal memlists:\n");
224 		for (i = 0; i < memlists_used; ++i) {
225 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
226 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
227 		}
228 	}
229 
230 	/*
231 	 * link together the memlists with native size pointers
232 	 */
233 	memlists[0].next = 0;
234 	memlists[0].prev = 0;
235 	for (i = 1; i < memlists_used; ++i) {
236 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
237 		memlists[i].next = 0;
238 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
239 	}
240 	bi->bi_phys_install = (native_ptr_t)memlists;
241 	DBG(bi->bi_phys_install);
242 }
243 
244 /*
245  * build bios reserved memlists
246  */
247 static void
248 build_rsvdmemlists(void)
249 {
250 	int i;
251 
252 	rsvdmemlists[0].next = 0;
253 	rsvdmemlists[0].prev = 0;
254 	for (i = 1; i < rsvdmemlists_used; ++i) {
255 		rsvdmemlists[i].prev =
256 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
257 		rsvdmemlists[i].next = 0;
258 		rsvdmemlists[i - 1].next =
259 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
260 	}
261 	bi->bi_rsvdmem = (native_ptr_t)rsvdmemlists;
262 	DBG(bi->bi_rsvdmem);
263 }
264 
265 #if defined(__xpv)
266 
267 /*
268  * halt on the hypervisor after a delay to drain console output
269  */
270 void
271 dboot_halt(void)
272 {
273 	uint_t i = 10000;
274 
275 	while (--i)
276 		HYPERVISOR_yield();
277 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
278 }
279 
280 /*
281  * From a machine address, find the corresponding pseudo-physical address.
282  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
283  * Machine addresses are the real underlying hardware addresses.
284  * These are needed for page table entries. Note that this routine is
285  * poorly protected. A bad value of "ma" will cause a page fault.
286  */
287 paddr_t
288 ma_to_pa(maddr_t ma)
289 {
290 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
291 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
292 	paddr_t pa;
293 
294 	if (pfn >= xen_info->nr_pages)
295 		return (-(paddr_t)1);
296 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
297 #ifdef DEBUG
298 	if (ma != pa_to_ma(pa))
299 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
300 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
301 #endif
302 	return (pa);
303 }
304 
305 /*
306  * From a pseudo-physical address, find the corresponding machine address.
307  */
308 maddr_t
309 pa_to_ma(paddr_t pa)
310 {
311 	pfn_t pfn;
312 	ulong_t mfn;
313 
314 	pfn = mmu_btop(pa - mfn_base);
315 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
316 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
317 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
318 #ifdef DEBUG
319 	if (mfn_to_pfn_mapping[mfn] != pfn)
320 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
321 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
322 #endif
323 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
324 }
325 
326 #endif	/* __xpv */
327 
328 x86pte_t
329 get_pteval(paddr_t table, uint_t index)
330 {
331 	if (pae_support)
332 		return (((x86pte_t *)(uintptr_t)table)[index]);
333 	return (((x86pte32_t *)(uintptr_t)table)[index]);
334 }
335 
336 /*ARGSUSED*/
337 void
338 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
339 {
340 #ifdef __xpv
341 	mmu_update_t t;
342 	maddr_t mtable = pa_to_ma(table);
343 	int retcnt;
344 
345 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
346 	t.val = pteval;
347 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
348 		dboot_panic("HYPERVISOR_mmu_update() failed");
349 #else /* __xpv */
350 	uintptr_t tab_addr = (uintptr_t)table;
351 
352 	if (pae_support)
353 		((x86pte_t *)tab_addr)[index] = pteval;
354 	else
355 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
356 	if (level == top_level && level == 2)
357 		reload_cr3();
358 #endif /* __xpv */
359 }
360 
361 paddr_t
362 make_ptable(x86pte_t *pteval, uint_t level)
363 {
364 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
365 
366 	if (level == top_level && level == 2)
367 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
368 	else
369 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
370 
371 #ifdef __xpv
372 	/* Remove write permission to the new page table. */
373 	if (HYPERVISOR_update_va_mapping(new_table,
374 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
375 		dboot_panic("HYP_update_va_mapping error");
376 #endif
377 
378 	if (map_debug)
379 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
380 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
381 	return (new_table);
382 }
383 
384 x86pte_t *
385 map_pte(paddr_t table, uint_t index)
386 {
387 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
388 }
389 
390 /*
391  * dump out the contents of page tables...
392  */
393 static void
394 dump_tables(void)
395 {
396 	uint_t save_index[4];	/* for recursion */
397 	char *save_table[4];	/* for recursion */
398 	uint_t	l;
399 	uint64_t va;
400 	uint64_t pgsize;
401 	int index;
402 	int i;
403 	x86pte_t pteval;
404 	char *table;
405 	static char *tablist = "\t\t\t";
406 	char *tabs = tablist + 3 - top_level;
407 	uint_t pa, pa1;
408 #if !defined(__xpv)
409 #define	maddr_t paddr_t
410 #endif /* !__xpv */
411 
412 	dboot_printf("Finished pagetables:\n");
413 	table = (char *)(uintptr_t)top_page_table;
414 	l = top_level;
415 	va = 0;
416 	for (index = 0; index < ptes_per_table; ++index) {
417 		pgsize = 1ull << shift_amt[l];
418 		if (pae_support)
419 			pteval = ((x86pte_t *)table)[index];
420 		else
421 			pteval = ((x86pte32_t *)table)[index];
422 		if (pteval == 0)
423 			goto next_entry;
424 
425 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
426 		    tabs + l, table, index, (uint64_t)pteval, va);
427 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
428 		dboot_printf(" physaddr=%x\n", pa);
429 
430 		/*
431 		 * Don't try to walk hypervisor private pagetables
432 		 */
433 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
434 			save_table[l] = table;
435 			save_index[l] = index;
436 			--l;
437 			index = -1;
438 			table = (char *)(uintptr_t)
439 			    ma_to_pa(pteval & MMU_PAGEMASK);
440 			goto recursion;
441 		}
442 
443 		/*
444 		 * shorten dump for consecutive mappings
445 		 */
446 		for (i = 1; index + i < ptes_per_table; ++i) {
447 			if (pae_support)
448 				pteval = ((x86pte_t *)table)[index + i];
449 			else
450 				pteval = ((x86pte32_t *)table)[index + i];
451 			if (pteval == 0)
452 				break;
453 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
454 			if (pa1 != pa + i * pgsize)
455 				break;
456 		}
457 		if (i > 2) {
458 			dboot_printf("%s...\n", tabs + l);
459 			va += pgsize * (i - 2);
460 			index += i - 2;
461 		}
462 next_entry:
463 		va += pgsize;
464 		if (l == 3 && index == 256)	/* VA hole */
465 			va = 0xffff800000000000ull;
466 recursion:
467 		;
468 	}
469 	if (l < top_level) {
470 		++l;
471 		index = save_index[l];
472 		table = save_table[l];
473 		goto recursion;
474 	}
475 }
476 
477 /*
478  * Add a mapping for the machine page at the given virtual address.
479  */
480 static void
481 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
482 {
483 	x86pte_t *ptep;
484 	x86pte_t pteval;
485 
486 	pteval = ma | pte_bits;
487 	if (level > 0)
488 		pteval |= PT_PAGESIZE;
489 	if (va >= target_kernel_text && pge_support)
490 		pteval |= PT_GLOBAL;
491 
492 	if (map_debug && ma != va)
493 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
494 		    " pte=0x%" PRIx64 " l=%d\n",
495 		    (uint64_t)ma, (uint64_t)va, pteval, level);
496 
497 #if defined(__xpv)
498 	/*
499 	 * see if we can avoid find_pte() on the hypervisor
500 	 */
501 	if (HYPERVISOR_update_va_mapping(va, pteval,
502 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
503 		return;
504 #endif
505 
506 	/*
507 	 * Find the pte that will map this address. This creates any
508 	 * missing intermediate level page tables
509 	 */
510 	ptep = find_pte(va, NULL, level, 0);
511 
512 	/*
513 	 * When paravirtualized, we must use hypervisor calls to modify the
514 	 * PTE, since paging is active. On real hardware we just write to
515 	 * the pagetables which aren't in use yet.
516 	 */
517 #if defined(__xpv)
518 	ptep = ptep;	/* shut lint up */
519 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
520 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
521 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
522 		    (uint64_t)va, level, (uint64_t)ma, pteval);
523 #else
524 	if (va < 1024 * 1024)
525 		pteval |= PT_NOCACHE;		/* for video RAM */
526 	if (pae_support)
527 		*ptep = pteval;
528 	else
529 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
530 #endif
531 }
532 
533 /*
534  * Add a mapping for the physical page at the given virtual address.
535  */
536 static void
537 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
538 {
539 	map_ma_at_va(pa_to_ma(pa), va, level);
540 }
541 
542 /*
543  * This is called to remove start..end from the
544  * possible range of PCI addresses.
545  */
546 const uint64_t pci_lo_limit = 0x00100000ul;
547 const uint64_t pci_hi_limit = 0xfff00000ul;
548 static void
549 exclude_from_pci(uint64_t start, uint64_t end)
550 {
551 	int i;
552 	int j;
553 	struct boot_memlist *ml;
554 
555 	for (i = 0; i < pcimemlists_used; ++i) {
556 		ml = &pcimemlists[i];
557 
558 		/* delete the entire range? */
559 		if (start <= ml->addr && ml->addr + ml->size <= end) {
560 			--pcimemlists_used;
561 			for (j = i; j < pcimemlists_used; ++j)
562 				pcimemlists[j] = pcimemlists[j + 1];
563 			--i;	/* to revisit the new one at this index */
564 		}
565 
566 		/* split a range? */
567 		else if (ml->addr < start && end < ml->addr + ml->size) {
568 
569 			++pcimemlists_used;
570 			if (pcimemlists_used > MAX_MEMLIST)
571 				dboot_panic("too many pcimemlists");
572 
573 			for (j = pcimemlists_used - 1; j > i; --j)
574 				pcimemlists[j] = pcimemlists[j - 1];
575 			ml->size = start - ml->addr;
576 
577 			++ml;
578 			ml->size = (ml->addr + ml->size) - end;
579 			ml->addr = end;
580 			++i;	/* skip on to next one */
581 		}
582 
583 		/* cut memory off the start? */
584 		else if (ml->addr < end && end < ml->addr + ml->size) {
585 			ml->size -= end - ml->addr;
586 			ml->addr = end;
587 		}
588 
589 		/* cut memory off the end? */
590 		else if (ml->addr <= start && start < ml->addr + ml->size) {
591 			ml->size = start - ml->addr;
592 		}
593 	}
594 }
595 
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 *
 * mmap_t is therefore a local five-field layout (base address, length and
 * type) under the hypervisor and plain mb_memory_map_t on bare metal, so
 * build_pcimemlists() can read the same field names in both builds.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
611 
612 static void
613 build_pcimemlists(mmap_t *mem, int num)
614 {
615 	mmap_t *mmap;
616 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
617 	uint64_t start;
618 	uint64_t end;
619 	int i;
620 
621 	/*
622 	 * initialize
623 	 */
624 	pcimemlists[0].addr = pci_lo_limit;
625 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
626 	pcimemlists_used = 1;
627 
628 	/*
629 	 * Fill in PCI memlists.
630 	 */
631 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
632 		start = ((uint64_t)mmap->base_addr_high << 32) +
633 		    mmap->base_addr_low;
634 		end = start + ((uint64_t)mmap->length_high << 32) +
635 		    mmap->length_low;
636 
637 		if (prom_debug)
638 			dboot_printf("\ttype: %d %" PRIx64 "..%"
639 			    PRIx64 "\n", mmap->type, start, end);
640 
641 		/*
642 		 * page align start and end
643 		 */
644 		start = (start + page_offset) & ~page_offset;
645 		end &= ~page_offset;
646 		if (end <= start)
647 			continue;
648 
649 		exclude_from_pci(start, end);
650 	}
651 
652 	/*
653 	 * Finish off the pcimemlist
654 	 */
655 	if (prom_debug) {
656 		for (i = 0; i < pcimemlists_used; ++i) {
657 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
658 			    PRIx64 "\n", pcimemlists[i].addr,
659 			    pcimemlists[i].addr + pcimemlists[i].size);
660 		}
661 	}
662 	pcimemlists[0].next = 0;
663 	pcimemlists[0].prev = 0;
664 	for (i = 1; i < pcimemlists_used; ++i) {
665 		pcimemlists[i].prev =
666 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
667 		pcimemlists[i].next = 0;
668 		pcimemlists[i - 1].next =
669 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
670 	}
671 	bi->bi_pcimem = (native_ptr_t)pcimemlists;
672 	DBG(bi->bi_pcimem);
673 }
674 
675 #if defined(__xpv)
676 /*
677  * Initialize memory allocator stuff from hypervisor-supplied start info.
678  *
679  * There is 512KB of scratch area after the boot stack page.
680  * We'll use that for everything except the kernel nucleus pages which are too
681  * big to fit there and are allocated last anyway.
682  */
683 #define	MAXMAPS	100
684 static mmap_t map_buffer[MAXMAPS];
685 static void
686 init_mem_alloc(void)
687 {
688 	int	local;	/* variables needed to find start region */
689 	paddr_t	scratch_start;
690 	xen_memory_map_t map;
691 
692 	DBG_MSG("Entered init_mem_alloc()\n");
693 
694 	/*
695 	 * Free memory follows the stack. There's at least 512KB of scratch
696 	 * space, rounded up to at least 2Mb alignment.  That should be enough
697 	 * for the page tables we'll need to build.  The nucleus memory is
698 	 * allocated last and will be outside the addressible range.  We'll
699 	 * switch to new page tables before we unpack the kernel
700 	 */
701 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
702 	DBG(scratch_start);
703 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
704 	DBG(scratch_end);
705 
706 	/*
707 	 * For paranoia, leave some space between hypervisor data and ours.
708 	 * Use 500 instead of 512.
709 	 */
710 	next_avail_addr = scratch_end - 500 * 1024;
711 	DBG(next_avail_addr);
712 
713 	/*
714 	 * The domain builder gives us at most 1 module
715 	 */
716 	DBG(xen_info->mod_len);
717 	if (xen_info->mod_len > 0) {
718 		DBG(xen_info->mod_start);
719 		modules[0].bm_addr = xen_info->mod_start;
720 		modules[0].bm_size = xen_info->mod_len;
721 		bi->bi_module_cnt = 1;
722 		bi->bi_modules = (native_ptr_t)modules;
723 	} else {
724 		bi->bi_module_cnt = 0;
725 		bi->bi_modules = NULL;
726 	}
727 	DBG(bi->bi_module_cnt);
728 	DBG(bi->bi_modules);
729 
730 	DBG(xen_info->mfn_list);
731 	DBG(xen_info->nr_pages);
732 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
733 	DBG(max_mem);
734 
735 	/*
736 	 * Using pseudo-physical addresses, so only 1 memlist element
737 	 */
738 	memlists[0].addr = 0;
739 	DBG(memlists[0].addr);
740 	memlists[0].size = max_mem;
741 	DBG(memlists[0].size);
742 	memlists_used = 1;
743 	DBG(memlists_used);
744 
745 	/*
746 	 * finish building physinstall list
747 	 */
748 	sort_physinstall();
749 
750 	/*
751 	 * build bios reserved memlists
752 	 */
753 	build_rsvdmemlists();
754 
755 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
756 		/*
757 		 * build PCI Memory list
758 		 */
759 		map.nr_entries = MAXMAPS;
760 		/*LINTED: constant in conditional context*/
761 		set_xen_guest_handle(map.buffer, map_buffer);
762 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
763 			dboot_panic("getting XENMEM_machine_memory_map failed");
764 		build_pcimemlists(map_buffer, map.nr_entries);
765 	}
766 }
767 
768 #else	/* !__xpv */
769 
770 /*
771  * During memory allocation, find the highest address not used yet.
772  */
773 static void
774 check_higher(paddr_t a)
775 {
776 	if (a < next_avail_addr)
777 		return;
778 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
779 	DBG(next_avail_addr);
780 }
781 
782 /*
783  * Walk through the module information finding the last used address.
784  * The first available address will become the top level page table.
785  *
786  * We then build the phys_install memlist from the multiboot information.
787  */
788 static void
789 init_mem_alloc(void)
790 {
791 	mb_memory_map_t *mmap;
792 	mb_module_t *mod;
793 	uint64_t start;
794 	uint64_t end;
795 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
796 	extern char _end[];
797 	int i;
798 
799 	DBG_MSG("Entered init_mem_alloc()\n");
800 	DBG((uintptr_t)mb_info);
801 
802 	/*
803 	 * search the modules to find the last used address
804 	 * we'll build the module list while we're walking through here
805 	 */
806 	DBG_MSG("\nFinding Modules\n");
807 	check_higher((paddr_t)&_end);
808 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
809 	    i < mb_info->mods_count;
810 	    ++mod, ++i) {
811 		if (prom_debug) {
812 			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
813 			    i, (char *)(mod->mod_name),
814 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
815 		}
816 		modules[i].bm_addr = mod->mod_start;
817 		modules[i].bm_size = mod->mod_end;
818 
819 		check_higher(mod->mod_end);
820 	}
821 	bi->bi_modules = (native_ptr_t)modules;
822 	DBG(bi->bi_modules);
823 	bi->bi_module_cnt = mb_info->mods_count;
824 	DBG(bi->bi_module_cnt);
825 
826 	/*
827 	 * Walk through the memory map from multiboot and build our memlist
828 	 * structures. Note these will have native format pointers.
829 	 */
830 	DBG_MSG("\nFinding Memory Map\n");
831 	DBG(mb_info->flags);
832 	max_mem = 0;
833 	if (mb_info->flags & 0x40) {
834 		int cnt = 0;
835 
836 		DBG(mb_info->mmap_addr);
837 		DBG(mb_info->mmap_length);
838 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
839 
840 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
841 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
842 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
843 		    + sizeof (mmap->size))) {
844 			++cnt;
845 			start = ((uint64_t)mmap->base_addr_high << 32) +
846 			    mmap->base_addr_low;
847 			end = start + ((uint64_t)mmap->length_high << 32) +
848 			    mmap->length_low;
849 
850 			if (prom_debug)
851 				dboot_printf("\ttype: %d %" PRIx64 "..%"
852 				    PRIx64 "\n", mmap->type, start, end);
853 
854 			/*
855 			 * page align start and end
856 			 */
857 			start = (start + page_offset) & ~page_offset;
858 			end &= ~page_offset;
859 			if (end <= start)
860 				continue;
861 
862 			/*
863 			 * only type 1 is usable RAM
864 			 */
865 			switch (mmap->type) {
866 			case 1:
867 				if (end > max_mem)
868 					max_mem = end;
869 				memlists[memlists_used].addr = start;
870 				memlists[memlists_used].size = end - start;
871 				++memlists_used;
872 				if (memlists_used > MAX_MEMLIST)
873 					dboot_panic("too many memlists");
874 				break;
875 			case 2:
876 				rsvdmemlists[rsvdmemlists_used].addr = start;
877 				rsvdmemlists[rsvdmemlists_used].size =
878 				    end - start;
879 				++rsvdmemlists_used;
880 				if (rsvdmemlists_used > MAX_MEMLIST)
881 					dboot_panic("too many rsvdmemlists");
882 				break;
883 			default:
884 				continue;
885 			}
886 		}
887 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
888 	} else if (mb_info->flags & 0x01) {
889 		DBG(mb_info->mem_lower);
890 		memlists[memlists_used].addr = 0;
891 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
892 		++memlists_used;
893 		DBG(mb_info->mem_upper);
894 		memlists[memlists_used].addr = 1024 * 1024;
895 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
896 		++memlists_used;
897 
898 		/*
899 		 * Old platform - assume I/O space at the end of memory.
900 		 */
901 		pcimemlists[0].addr =
902 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
903 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
904 		pcimemlists[0].next = 0;
905 		pcimemlists[0].prev = 0;
906 		bi->bi_pcimem = (native_ptr_t)pcimemlists;
907 		DBG(bi->bi_pcimem);
908 	} else {
909 		dboot_panic("No memory info from boot loader!!!");
910 	}
911 
912 	check_higher(bi->bi_cmdline);
913 
914 	/*
915 	 * finish processing the physinstall list
916 	 */
917 	sort_physinstall();
918 
919 	/*
920 	 * build bios reserved mem lists
921 	 */
922 	build_rsvdmemlists();
923 }
924 #endif /* !__xpv */
925 
926 /*
927  * Simple memory allocator, allocates aligned physical memory.
928  * Note that startup_kernel() only allocates memory, never frees.
929  * Memory usage just grows in an upward direction.
930  */
931 static void *
932 do_mem_alloc(uint32_t size, uint32_t align)
933 {
934 	uint_t i;
935 	uint64_t best;
936 	uint64_t start;
937 	uint64_t end;
938 
939 	/*
940 	 * make sure size is a multiple of pagesize
941 	 */
942 	size = RNDUP(size, MMU_PAGESIZE);
943 	next_avail_addr = RNDUP(next_avail_addr, align);
944 
945 	/*
946 	 * XXPV fixme joe
947 	 *
948 	 * a really large bootarchive that causes you to run out of memory
949 	 * may cause this to blow up
950 	 */
951 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
952 	best = (uint64_t)-size;
953 	for (i = 0; i < memlists_used; ++i) {
954 		start = memlists[i].addr;
955 #if defined(__xpv)
956 		start += mfn_base;
957 #endif
958 		end = start + memlists[i].size;
959 
960 		/*
961 		 * did we find the desired address?
962 		 */
963 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
964 			best = next_avail_addr;
965 			goto done;
966 		}
967 
968 		/*
969 		 * if not is this address the best so far?
970 		 */
971 		if (start > next_avail_addr && start < best &&
972 		    RNDUP(start, align) + size <= end)
973 			best = RNDUP(start, align);
974 	}
975 
976 	/*
977 	 * We didn't find exactly the address we wanted, due to going off the
978 	 * end of a memory region. Return the best found memory address.
979 	 */
980 done:
981 	next_avail_addr = best + size;
982 #if defined(__xpv)
983 	if (next_avail_addr > scratch_end)
984 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
985 		    "0x%lx", (ulong_t)next_avail_addr,
986 		    (ulong_t)scratch_end);
987 #endif
988 	(void) memset((void *)(uintptr_t)best, 0, size);
989 	return ((void *)(uintptr_t)best);
990 }
991 
992 void *
993 mem_alloc(uint32_t size)
994 {
995 	return (do_mem_alloc(size, MMU_PAGESIZE));
996 }
997 
998 
999 /*
1000  * Build page tables to map all of memory used so far as well as the kernel.
1001  */
1002 static void
1003 build_page_tables(void)
1004 {
1005 	uint32_t psize;
1006 	uint32_t level;
1007 	uint32_t off;
1008 	uint64_t start;
1009 #if !defined(__xpv)
1010 	uint32_t i;
1011 	uint64_t end;
1012 #endif	/* __xpv */
1013 
1014 	/*
1015 	 * If we're on metal, we need to create the top level pagetable.
1016 	 */
1017 #if defined(__xpv)
1018 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1019 #else /* __xpv */
1020 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1021 #endif /* __xpv */
1022 	DBG((uintptr_t)top_page_table);
1023 
1024 	/*
1025 	 * Determine if we'll use large mappings for kernel, then map it.
1026 	 */
1027 	if (largepage_support) {
1028 		psize = lpagesize;
1029 		level = 1;
1030 	} else {
1031 		psize = MMU_PAGESIZE;
1032 		level = 0;
1033 	}
1034 
1035 	DBG_MSG("Mapping kernel\n");
1036 	DBG(ktext_phys);
1037 	DBG(target_kernel_text);
1038 	DBG(ksize);
1039 	DBG(psize);
1040 	for (off = 0; off < ksize; off += psize)
1041 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1042 
1043 	/*
1044 	 * The kernel will need a 1 page window to work with page tables
1045 	 */
1046 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1047 	DBG(bi->bi_pt_window);
1048 	bi->bi_pte_to_pt_window =
1049 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1050 	DBG(bi->bi_pte_to_pt_window);
1051 
1052 #if defined(__xpv)
1053 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1054 		/* If this is a domU we're done. */
1055 		DBG_MSG("\nPage tables constructed\n");
1056 		return;
1057 	}
1058 #endif /* __xpv */
1059 
1060 	/*
1061 	 * We need 1:1 mappings for the lower 1M of memory to access
1062 	 * BIOS tables used by a couple of drivers during boot.
1063 	 *
1064 	 * The following code works because our simple memory allocator
1065 	 * only grows usage in an upwards direction.
1066 	 *
1067 	 * Note that by this point in boot some mappings for low memory
1068 	 * may already exist because we've already accessed device in low
1069 	 * memory.  (Specifically the video frame buffer and keyboard
1070 	 * status ports.)  If we're booting on raw hardware then GRUB
1071 	 * created these mappings for us.  If we're booting under a
1072 	 * hypervisor then we went ahead and remapped these devices into
1073 	 * memory allocated within dboot itself.
1074 	 */
1075 	if (map_debug)
1076 		dboot_printf("1:1 map pa=0..1Meg\n");
1077 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1078 #if defined(__xpv)
1079 		map_ma_at_va(start, start, 0);
1080 #else /* __xpv */
1081 		map_pa_at_va(start, start, 0);
1082 #endif /* __xpv */
1083 	}
1084 
1085 #if !defined(__xpv)
1086 	for (i = 0; i < memlists_used; ++i) {
1087 		start = memlists[i].addr;
1088 
1089 		end = start + memlists[i].size;
1090 
1091 		if (map_debug)
1092 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1093 			    start, end);
1094 		while (start < end && start < next_avail_addr) {
1095 			map_pa_at_va(start, start, 0);
1096 			start += MMU_PAGESIZE;
1097 		}
1098 	}
1099 #endif /* !__xpv */
1100 
1101 	DBG_MSG("\nPage tables constructed\n");
1102 }
1103 
/*
 * Message handed to dboot_panic() by startup_kernel() when the boot command
 * line still contains "multiboot".  It is written as adjacent string
 * literals, one per output line; the text contains no '%' characters.
 */
#define	NO_MULTIBOOT	\
	"multiboot is no longer used to boot the Solaris Operating System.\n" \
	"The grub entry should be changed to:\n" \
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
	"module$ /platform/i86pc/$ISADIR/boot_archive\n" \
	"See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
1110 
1111 /*
1112  * startup_kernel has a pretty simple job. It builds pagetables which reflect
1113  * 1:1 mappings for all memory in use. It then also adds mappings for
1114  * the kernel nucleus at virtual address of target_kernel_text using large page
1115  * mappings. The page table pages are also accessible at 1:1 mapped
1116  * virtual addresses.
1117  */
1118 /*ARGSUSED*/
1119 void
1120 startup_kernel(void)
1121 {
1122 	char *cmdline;
1123 	uintptr_t addr;
1124 #if defined(__xpv)
1125 	physdev_set_iopl_t set_iopl;
1126 #endif /* __xpv */
1127 
1128 	/*
1129 	 * At this point we are executing in a 32 bit real mode.
1130 	 */
1131 #if defined(__xpv)
1132 	cmdline = (char *)xen_info->cmd_line;
1133 #else /* __xpv */
1134 	cmdline = (char *)mb_info->cmdline;
1135 #endif /* __xpv */
1136 
1137 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1138 	map_debug = (strstr(cmdline, "map_debug") != NULL);
1139 
1140 #if defined(__xpv)
1141 	/*
1142 	 * For dom0, before we initialize the console subsystem we'll
1143 	 * need to enable io operations, so set I/O priveldge level to 1.
1144 	 */
1145 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1146 		set_iopl.iopl = 1;
1147 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1148 	}
1149 #endif /* __xpv */
1150 
1151 	bcons_init(cmdline);
1152 	DBG_MSG("\n\nSolaris prekernel set: ");
1153 	DBG_MSG(cmdline);
1154 	DBG_MSG("\n");
1155 
1156 	if (strstr(cmdline, "multiboot") != NULL) {
1157 		dboot_panic(NO_MULTIBOOT);
1158 	}
1159 
1160 	/*
1161 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1162 	 */
1163 	addr = (uintptr_t)boot_info;
1164 	addr = (addr + 0xf) & ~0xf;
1165 	bi = (struct xboot_info *)addr;
1166 	DBG((uintptr_t)bi);
1167 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1168 
1169 	/*
1170 	 * Need correct target_kernel_text value
1171 	 */
1172 #if defined(_BOOT_TARGET_amd64)
1173 	target_kernel_text = KERNEL_TEXT_amd64;
1174 #elif defined(__xpv)
1175 	target_kernel_text = KERNEL_TEXT_i386_xpv;
1176 #else
1177 	target_kernel_text = KERNEL_TEXT_i386;
1178 #endif
1179 	DBG(target_kernel_text);
1180 
1181 #if defined(__xpv)
1182 
1183 	/*
1184 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
1185 	 */
1186 
1187 #if defined(_BOOT_TARGET_amd64)
1188 	/*
1189 	 * 64-bit hypervisor.
1190 	 */
1191 	amd64_support = 1;
1192 	pae_support = 1;
1193 
1194 #else	/* _BOOT_TARGET_amd64 */
1195 
1196 	/*
1197 	 * See if we are running on a PAE Hypervisor
1198 	 */
1199 	{
1200 		xen_capabilities_info_t caps;
1201 
1202 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1203 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
1204 		caps[sizeof (caps) - 1] = 0;
1205 		if (prom_debug)
1206 			dboot_printf("xen capabilities %s\n", caps);
1207 		if (strstr(caps, "x86_32p") != NULL)
1208 			pae_support = 1;
1209 	}
1210 
1211 #endif	/* _BOOT_TARGET_amd64 */
1212 	{
1213 		xen_platform_parameters_t p;
1214 
1215 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1216 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
1217 		DBG(p.virt_start);
1218 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1219 	}
1220 
1221 	/*
1222 	 * The hypervisor loads stuff starting at 1Gig
1223 	 */
1224 	mfn_base = ONE_GIG;
1225 	DBG(mfn_base);
1226 
1227 	/*
1228 	 * enable writable page table mode for the hypervisor
1229 	 */
1230 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1231 	    VMASST_TYPE_writable_pagetables) < 0)
1232 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1233 
1234 	/*
1235 	 * check for NX support
1236 	 */
1237 	if (pae_support) {
1238 		uint32_t eax = 0x80000000;
1239 		uint32_t edx = get_cpuid_edx(&eax);
1240 
1241 		if (eax >= 0x80000001) {
1242 			eax = 0x80000001;
1243 			edx = get_cpuid_edx(&eax);
1244 			if (edx & CPUID_AMD_EDX_NX)
1245 				NX_support = 1;
1246 		}
1247 	}
1248 
1249 #if !defined(_BOOT_TARGET_amd64)
1250 
1251 	/*
1252 	 * The 32-bit hypervisor uses segmentation to protect itself from
1253 	 * guests. This means when a guest attempts to install a flat 4GB
1254 	 * code or data descriptor the 32-bit hypervisor will protect itself
1255 	 * by silently shrinking the segment such that if the guest attempts
1256 	 * any access where the hypervisor lives a #gp fault is generated.
1257 	 * The problem is that some applications expect a full 4GB flat
1258 	 * segment for their current thread pointer and will use negative
1259 	 * offset segment wrap around to access data. TLS support in linux
1260 	 * brand is one example of this.
1261 	 *
1262 	 * The 32-bit hypervisor can catch the #gp fault in these cases
1263 	 * and emulate the access without passing the #gp fault to the guest
1264 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1265 	 * Seems like this should have been the default.
1266 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
1267 	 * to deal with emulating these accesses.
1268 	 */
1269 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1270 	    VMASST_TYPE_4gb_segments) < 0)
1271 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1272 #endif	/* !_BOOT_TARGET_amd64 */
1273 
1274 #else	/* __xpv */
1275 
1276 	/*
1277 	 * use cpuid to enable MMU features
1278 	 */
1279 	if (have_cpuid()) {
1280 		uint32_t eax, edx;
1281 
1282 		eax = 1;
1283 		edx = get_cpuid_edx(&eax);
1284 		if (edx & CPUID_INTC_EDX_PSE)
1285 			largepage_support = 1;
1286 		if (edx & CPUID_INTC_EDX_PGE)
1287 			pge_support = 1;
1288 		if (edx & CPUID_INTC_EDX_PAE)
1289 			pae_support = 1;
1290 
1291 		eax = 0x80000000;
1292 		edx = get_cpuid_edx(&eax);
1293 		if (eax >= 0x80000001) {
1294 			eax = 0x80000001;
1295 			edx = get_cpuid_edx(&eax);
1296 			if (edx & CPUID_AMD_EDX_LM)
1297 				amd64_support = 1;
1298 			if (edx & CPUID_AMD_EDX_NX)
1299 				NX_support = 1;
1300 		}
1301 	} else {
1302 		dboot_printf("cpuid not supported\n");
1303 	}
1304 #endif /* __xpv */
1305 
1306 
1307 #if defined(_BOOT_TARGET_amd64)
1308 	if (amd64_support == 0)
1309 		dboot_panic("long mode not supported, rebooting");
1310 	else if (pae_support == 0)
1311 		dboot_panic("long mode, but no PAE; rebooting");
1312 #else
1313 	/*
1314 	 * Allow the command line to over-ride use of PAE for 32 bit.
1315 	 */
1316 	if (strstr(cmdline, "disablePAE=true") != NULL) {
1317 		pae_support = 0;
1318 		NX_support = 0;
1319 		amd64_support = 0;
1320 	}
1321 #endif
1322 
1323 	/*
1324 	 * initialize the simple memory allocator
1325 	 */
1326 	init_mem_alloc();
1327 
1328 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1329 	/*
1330 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1331 	 */
1332 	if (max_mem < FOUR_GIG && NX_support == 0)
1333 		pae_support = 0;
1334 #endif
1335 
1336 	/*
1337 	 * configure mmu information
1338 	 */
1339 	if (pae_support) {
1340 		shift_amt = shift_amt_pae;
1341 		ptes_per_table = 512;
1342 		pte_size = 8;
1343 		lpagesize = TWO_MEG;
1344 #if defined(_BOOT_TARGET_amd64)
1345 		top_level = 3;
1346 #else
1347 		top_level = 2;
1348 #endif
1349 	} else {
1350 		pae_support = 0;
1351 		NX_support = 0;
1352 		shift_amt = shift_amt_nopae;
1353 		ptes_per_table = 1024;
1354 		pte_size = 4;
1355 		lpagesize = FOUR_MEG;
1356 		top_level = 1;
1357 	}
1358 
1359 	DBG(pge_support);
1360 	DBG(NX_support);
1361 	DBG(largepage_support);
1362 	DBG(amd64_support);
1363 	DBG(top_level);
1364 	DBG(pte_size);
1365 	DBG(ptes_per_table);
1366 	DBG(lpagesize);
1367 
1368 #if defined(__xpv)
1369 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
1370 #else
1371 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
1372 #endif
1373 
1374 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1375 	/*
1376 	 * For grub, copy kernel bits from the ELF64 file to final place.
1377 	 */
1378 	DBG_MSG("\nAllocating nucleus pages.\n");
1379 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1380 	if (ktext_phys == 0)
1381 		dboot_panic("failed to allocate aligned kernel memory");
1382 	if (dboot_elfload64(mb_header.load_addr) != 0)
1383 		dboot_panic("failed to parse kernel ELF image, rebooting");
1384 #endif
1385 
1386 	DBG(ktext_phys);
1387 
1388 	/*
1389 	 * Allocate page tables.
1390 	 */
1391 	build_page_tables();
1392 
1393 	/*
1394 	 * return to assembly code to switch to running kernel
1395 	 */
1396 	entry_addr_low = (uint32_t)target_kernel_text;
1397 	DBG(entry_addr_low);
1398 	bi->bi_use_largepage = largepage_support;
1399 	bi->bi_use_pae = pae_support;
1400 	bi->bi_use_pge = pge_support;
1401 	bi->bi_use_nx = NX_support;
1402 
1403 #if defined(__xpv)
1404 
1405 	bi->bi_next_paddr = next_avail_addr - mfn_base;
1406 	DBG(bi->bi_next_paddr);
1407 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1408 	DBG(bi->bi_next_vaddr);
1409 
1410 	/*
1411 	 * unmap unused pages in start area to make them available for DMA
1412 	 */
1413 	while (next_avail_addr < scratch_end) {
1414 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
1415 		    0, UVMF_INVLPG | UVMF_LOCAL);
1416 		next_avail_addr += MMU_PAGESIZE;
1417 	}
1418 
1419 	bi->bi_xen_start_info = (uintptr_t)xen_info;
1420 	DBG((uintptr_t)HYPERVISOR_shared_info);
1421 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1422 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1423 
1424 #else /* __xpv */
1425 
1426 	bi->bi_next_paddr = next_avail_addr;
1427 	DBG(bi->bi_next_paddr);
1428 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1429 	DBG(bi->bi_next_vaddr);
1430 	bi->bi_mb_info = (uintptr_t)mb_info;
1431 	bi->bi_top_page_table = (uintptr_t)top_page_table;
1432 
1433 #endif /* __xpv */
1434 
1435 	bi->bi_kseg_size = FOUR_MEG;
1436 	DBG(bi->bi_kseg_size);
1437 
1438 #ifndef __xpv
1439 	if (map_debug)
1440 		dump_tables();
1441 #endif
1442 
1443 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1444 }
1445