xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 28b6fd27d5ff75fe6fdeb119a21575b0652a7e70)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2013 Joyent, Inc.  All rights reserved.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/sha1.h>
37 #include <util/string.h>
38 #include <util/strtolctype.h>
39 
40 #if defined(__xpv)
41 
42 #include <sys/hypervisor.h>
43 uintptr_t xen_virt_start;
44 pfn_t *mfn_to_pfn_mapping;
45 
46 #else /* !__xpv */
47 
48 extern multiboot_header_t mb_header;
49 extern int have_cpuid(void);
50 
51 #endif /* !__xpv */
52 
53 #include <sys/inttypes.h>
54 #include <sys/bootinfo.h>
55 #include <sys/mach_mmu.h>
56 #include <sys/boot_console.h>
57 
58 #include "dboot_asm.h"
59 #include "dboot_printf.h"
60 #include "dboot_xboot.h"
61 #include "dboot_elfload.h"
62 
/*
 * Number of ASCII hex characters needed to spell out a SHA-1 digest
 * (two characters per digest byte).
 */
#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
64 
65 /*
66  * This file contains code that runs to transition us from either a multiboot
67  * compliant loader (32 bit non-paging) or a XPV domain loader to
68  * regular kernel execution. Its task is to setup the kernel memory image
69  * and page tables.
70  *
71  * The code executes as:
72  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
73  * 	- a 32 bit program for the 32-bit PV hypervisor
74  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
75  *
76  * Under the PV hypervisor, we must create mappings for any memory beyond the
77  * initial start of day allocation (such as the kernel itself).
78  *
79  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
81  */
82 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 *
 * ptp_bits is OR'd into entries that point at lower level page tables (see
 * make_ptable()); pte_bits is OR'd into entries that map pages (see
 * map_ma_at_va()).
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * ktext_phys is the target (physical) address where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 *
 * ksize is the size of that nucleus area in bytes.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

/*
 * Value to use for KERNEL_TEXT. map_ma_at_va() marks mappings at or above
 * this virtual address PT_GLOBAL when pge_support is set.
 */
static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel().
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation: the next address not yet claimed.
 * check_higher() moves it past anything the boot loader already placed in
 * memory.
 */
static paddr_t next_avail_addr = 0;
108 
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;		/* start-of-day info from the hypervisor */

#else	/* !__xpv */

/*
 * If on the metal, then we have a multiboot loader; this is the multiboot
 * information structure it handed to us.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */
129 
/*
 * This contains information passed to the kernel. The routines in this file
 * fill it in through the "bi" pointer (e.g. bi_phys_install, bi_rsvdmem,
 * bi_pcimem, bi_modules and bi_module_cnt).
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU.
 *
 * pae_support selects between 64 bit (x86pte_t) and 32 bit (x86pte32_t)
 * page table entries wherever entries are read or written in this file;
 * pge_support allows map_ma_at_va() to set PT_GLOBAL on kernel mappings.
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;
155 
/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 *
 * Each table is a fixed array of MAX_MEMLIST entries together with a count of
 * the entries currently in use:
 *	memlists	- installed physical memory (see sort_physinstall())
 *	pcimemlists	- address ranges left over for PCI
 *			  (see build_pcimemlists())
 *	rsvdmemlists	- BIOS reserved memory (see build_rsvdmemlists())
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * The size of this table (MAX_BOOT_MODULES) should match what's in the
 * bootloader.  It's arbitrary, but GRUB in particular has limitations on how
 * much space it can use before it stops working properly.  This should be
 * enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;	/* number of entries of modules[] in use */

/*
 * Debugging flags: prom_debug enables general progress messages and
 * map_debug enables messages about page tables and mappings.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

/* name given to a boot module that was passed without a name string */
static char noname[2] = "-";
182 
183 /*
184  * Either hypervisor-specific or grub-specific code builds the initial
185  * memlists. This code does the sort/merge/link for final use.
186  */
187 static void
188 sort_physinstall(void)
189 {
190 	int i;
191 #if !defined(__xpv)
192 	int j;
193 	struct boot_memlist tmp;
194 
195 	/*
196 	 * Now sort the memlists, in case they weren't in order.
197 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
198 	 */
199 	DBG_MSG("Sorting phys-installed list\n");
200 	for (j = memlists_used - 1; j > 0; --j) {
201 		for (i = 0; i < j; ++i) {
202 			if (memlists[i].addr < memlists[i + 1].addr)
203 				continue;
204 			tmp = memlists[i];
205 			memlists[i] = memlists[i + 1];
206 			memlists[i + 1] = tmp;
207 		}
208 	}
209 
210 	/*
211 	 * Merge any memlists that don't have holes between them.
212 	 */
213 	for (i = 0; i <= memlists_used - 1; ++i) {
214 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
215 			continue;
216 
217 		if (prom_debug)
218 			dboot_printf(
219 			    "merging mem segs %" PRIx64 "...%" PRIx64
220 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
221 			    memlists[i].addr,
222 			    memlists[i].addr + memlists[i].size,
223 			    memlists[i + 1].addr,
224 			    memlists[i + 1].addr + memlists[i + 1].size);
225 
226 		memlists[i].size += memlists[i + 1].size;
227 		for (j = i + 1; j < memlists_used - 1; ++j)
228 			memlists[j] = memlists[j + 1];
229 		--memlists_used;
230 		DBG(memlists_used);
231 		--i;	/* after merging we need to reexamine, so do this */
232 	}
233 #endif	/* __xpv */
234 
235 	if (prom_debug) {
236 		dboot_printf("\nFinal memlists:\n");
237 		for (i = 0; i < memlists_used; ++i) {
238 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
239 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
240 		}
241 	}
242 
243 	/*
244 	 * link together the memlists with native size pointers
245 	 */
246 	memlists[0].next = 0;
247 	memlists[0].prev = 0;
248 	for (i = 1; i < memlists_used; ++i) {
249 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
250 		memlists[i].next = 0;
251 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
252 	}
253 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
254 	DBG(bi->bi_phys_install);
255 }
256 
257 /*
258  * build bios reserved memlists
259  */
260 static void
261 build_rsvdmemlists(void)
262 {
263 	int i;
264 
265 	rsvdmemlists[0].next = 0;
266 	rsvdmemlists[0].prev = 0;
267 	for (i = 1; i < rsvdmemlists_used; ++i) {
268 		rsvdmemlists[i].prev =
269 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
270 		rsvdmemlists[i].next = 0;
271 		rsvdmemlists[i - 1].next =
272 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
273 	}
274 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
275 	DBG(bi->bi_rsvdmem);
276 }
277 
278 #if defined(__xpv)
279 
280 /*
281  * halt on the hypervisor after a delay to drain console output
282  */
283 void
284 dboot_halt(void)
285 {
286 	uint_t i = 10000;
287 
288 	while (--i)
289 		(void) HYPERVISOR_yield();
290 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
291 }
292 
293 /*
294  * From a machine address, find the corresponding pseudo-physical address.
295  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
296  * Machine addresses are the real underlying hardware addresses.
297  * These are needed for page table entries. Note that this routine is
298  * poorly protected. A bad value of "ma" will cause a page fault.
299  */
300 paddr_t
301 ma_to_pa(maddr_t ma)
302 {
303 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
304 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
305 	paddr_t pa;
306 
307 	if (pfn >= xen_info->nr_pages)
308 		return (-(paddr_t)1);
309 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
310 #ifdef DEBUG
311 	if (ma != pa_to_ma(pa))
312 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
313 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
314 #endif
315 	return (pa);
316 }
317 
318 /*
319  * From a pseudo-physical address, find the corresponding machine address.
320  */
321 maddr_t
322 pa_to_ma(paddr_t pa)
323 {
324 	pfn_t pfn;
325 	ulong_t mfn;
326 
327 	pfn = mmu_btop(pa - mfn_base);
328 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
329 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
330 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
331 #ifdef DEBUG
332 	if (mfn_to_pfn_mapping[mfn] != pfn)
333 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
334 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
335 #endif
336 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
337 }
338 
339 #endif	/* __xpv */
340 
341 x86pte_t
342 get_pteval(paddr_t table, uint_t index)
343 {
344 	if (pae_support)
345 		return (((x86pte_t *)(uintptr_t)table)[index]);
346 	return (((x86pte32_t *)(uintptr_t)table)[index]);
347 }
348 
349 /*ARGSUSED*/
350 void
351 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
352 {
353 #ifdef __xpv
354 	mmu_update_t t;
355 	maddr_t mtable = pa_to_ma(table);
356 	int retcnt;
357 
358 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
359 	t.val = pteval;
360 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
361 		dboot_panic("HYPERVISOR_mmu_update() failed");
362 #else /* __xpv */
363 	uintptr_t tab_addr = (uintptr_t)table;
364 
365 	if (pae_support)
366 		((x86pte_t *)tab_addr)[index] = pteval;
367 	else
368 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
369 	if (level == top_level && level == 2)
370 		reload_cr3();
371 #endif /* __xpv */
372 }
373 
374 paddr_t
375 make_ptable(x86pte_t *pteval, uint_t level)
376 {
377 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
378 
379 	if (level == top_level && level == 2)
380 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
381 	else
382 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
383 
384 #ifdef __xpv
385 	/* Remove write permission to the new page table. */
386 	if (HYPERVISOR_update_va_mapping(new_table,
387 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
388 		dboot_panic("HYP_update_va_mapping error");
389 #endif
390 
391 	if (map_debug)
392 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
393 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
394 	return (new_table);
395 }
396 
397 x86pte_t *
398 map_pte(paddr_t table, uint_t index)
399 {
400 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
401 }
402 
403 /*
404  * dump out the contents of page tables...
405  */
406 static void
407 dump_tables(void)
408 {
409 	uint_t save_index[4];	/* for recursion */
410 	char *save_table[4];	/* for recursion */
411 	uint_t	l;
412 	uint64_t va;
413 	uint64_t pgsize;
414 	int index;
415 	int i;
416 	x86pte_t pteval;
417 	char *table;
418 	static char *tablist = "\t\t\t";
419 	char *tabs = tablist + 3 - top_level;
420 	uint_t pa, pa1;
421 #if !defined(__xpv)
422 #define	maddr_t paddr_t
423 #endif /* !__xpv */
424 
425 	dboot_printf("Finished pagetables:\n");
426 	table = (char *)(uintptr_t)top_page_table;
427 	l = top_level;
428 	va = 0;
429 	for (index = 0; index < ptes_per_table; ++index) {
430 		pgsize = 1ull << shift_amt[l];
431 		if (pae_support)
432 			pteval = ((x86pte_t *)table)[index];
433 		else
434 			pteval = ((x86pte32_t *)table)[index];
435 		if (pteval == 0)
436 			goto next_entry;
437 
438 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
439 		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
440 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
441 		dboot_printf(" physaddr=%x\n", pa);
442 
443 		/*
444 		 * Don't try to walk hypervisor private pagetables
445 		 */
446 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
447 			save_table[l] = table;
448 			save_index[l] = index;
449 			--l;
450 			index = -1;
451 			table = (char *)(uintptr_t)
452 			    ma_to_pa(pteval & MMU_PAGEMASK);
453 			goto recursion;
454 		}
455 
456 		/*
457 		 * shorten dump for consecutive mappings
458 		 */
459 		for (i = 1; index + i < ptes_per_table; ++i) {
460 			if (pae_support)
461 				pteval = ((x86pte_t *)table)[index + i];
462 			else
463 				pteval = ((x86pte32_t *)table)[index + i];
464 			if (pteval == 0)
465 				break;
466 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
467 			if (pa1 != pa + i * pgsize)
468 				break;
469 		}
470 		if (i > 2) {
471 			dboot_printf("%s...\n", tabs + l);
472 			va += pgsize * (i - 2);
473 			index += i - 2;
474 		}
475 next_entry:
476 		va += pgsize;
477 		if (l == 3 && index == 256)	/* VA hole */
478 			va = 0xffff800000000000ull;
479 recursion:
480 		;
481 	}
482 	if (l < top_level) {
483 		++l;
484 		index = save_index[l];
485 		table = save_table[l];
486 		goto recursion;
487 	}
488 }
489 
490 /*
491  * Add a mapping for the machine page at the given virtual address.
492  */
493 static void
494 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
495 {
496 	x86pte_t *ptep;
497 	x86pte_t pteval;
498 
499 	pteval = ma | pte_bits;
500 	if (level > 0)
501 		pteval |= PT_PAGESIZE;
502 	if (va >= target_kernel_text && pge_support)
503 		pteval |= PT_GLOBAL;
504 
505 	if (map_debug && ma != va)
506 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
507 		    " pte=0x%" PRIx64 " l=%d\n",
508 		    (uint64_t)ma, (uint64_t)va, pteval, level);
509 
510 #if defined(__xpv)
511 	/*
512 	 * see if we can avoid find_pte() on the hypervisor
513 	 */
514 	if (HYPERVISOR_update_va_mapping(va, pteval,
515 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
516 		return;
517 #endif
518 
519 	/*
520 	 * Find the pte that will map this address. This creates any
521 	 * missing intermediate level page tables
522 	 */
523 	ptep = find_pte(va, NULL, level, 0);
524 
525 	/*
526 	 * When paravirtualized, we must use hypervisor calls to modify the
527 	 * PTE, since paging is active. On real hardware we just write to
528 	 * the pagetables which aren't in use yet.
529 	 */
530 #if defined(__xpv)
531 	ptep = ptep;	/* shut lint up */
532 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
533 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
534 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
535 		    (uint64_t)va, level, (uint64_t)ma, pteval);
536 #else
537 	if (va < 1024 * 1024)
538 		pteval |= PT_NOCACHE;		/* for video RAM */
539 	if (pae_support)
540 		*ptep = pteval;
541 	else
542 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
543 #endif
544 }
545 
546 /*
547  * Add a mapping for the physical page at the given virtual address.
548  */
549 static void
550 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
551 {
552 	map_ma_at_va(pa_to_ma(pa), va, level);
553 }
554 
555 /*
556  * This is called to remove start..end from the
557  * possible range of PCI addresses.
558  */
559 const uint64_t pci_lo_limit = 0x00100000ul;
560 const uint64_t pci_hi_limit = 0xfff00000ul;
561 static void
562 exclude_from_pci(uint64_t start, uint64_t end)
563 {
564 	int i;
565 	int j;
566 	struct boot_memlist *ml;
567 
568 	for (i = 0; i < pcimemlists_used; ++i) {
569 		ml = &pcimemlists[i];
570 
571 		/* delete the entire range? */
572 		if (start <= ml->addr && ml->addr + ml->size <= end) {
573 			--pcimemlists_used;
574 			for (j = i; j < pcimemlists_used; ++j)
575 				pcimemlists[j] = pcimemlists[j + 1];
576 			--i;	/* to revisit the new one at this index */
577 		}
578 
579 		/* split a range? */
580 		else if (ml->addr < start && end < ml->addr + ml->size) {
581 
582 			++pcimemlists_used;
583 			if (pcimemlists_used > MAX_MEMLIST)
584 				dboot_panic("too many pcimemlists");
585 
586 			for (j = pcimemlists_used - 1; j > i; --j)
587 				pcimemlists[j] = pcimemlists[j - 1];
588 			ml->size = start - ml->addr;
589 
590 			++ml;
591 			ml->size = (ml->addr + ml->size) - end;
592 			ml->addr = end;
593 			++i;	/* skip on to next one */
594 		}
595 
596 		/* cut memory off the start? */
597 		else if (ml->addr < end && end < ml->addr + ml->size) {
598 			ml->size -= end - ml->addr;
599 			ml->addr = end;
600 		}
601 
602 		/* cut memory off the end? */
603 		else if (ml->addr <= start && start < ml->addr + ml->size) {
604 			ml->size = start - ml->addr;
605 		}
606 	}
607 }
608 
/*
 * mmap_t describes one entry of the memory map handed to
 * build_pcimemlists().
 *
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source. So under the hypervisor we declare the remaining
 * fields ourselves, while on bare metal the multiboot type is used as is.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;	/* start address, low 32 bits */
	uint32_t	base_addr_high;	/* start address, high 32 bits */
	uint32_t	length_low;	/* length in bytes, low 32 bits */
	uint32_t	length_high;	/* length in bytes, high 32 bits */
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
624 
625 static void
626 build_pcimemlists(mmap_t *mem, int num)
627 {
628 	mmap_t *mmap;
629 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
630 	uint64_t start;
631 	uint64_t end;
632 	int i;
633 
634 	/*
635 	 * initialize
636 	 */
637 	pcimemlists[0].addr = pci_lo_limit;
638 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
639 	pcimemlists_used = 1;
640 
641 	/*
642 	 * Fill in PCI memlists.
643 	 */
644 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
645 		start = ((uint64_t)mmap->base_addr_high << 32) +
646 		    mmap->base_addr_low;
647 		end = start + ((uint64_t)mmap->length_high << 32) +
648 		    mmap->length_low;
649 
650 		if (prom_debug)
651 			dboot_printf("\ttype: %d %" PRIx64 "..%"
652 			    PRIx64 "\n", mmap->type, start, end);
653 
654 		/*
655 		 * page align start and end
656 		 */
657 		start = (start + page_offset) & ~page_offset;
658 		end &= ~page_offset;
659 		if (end <= start)
660 			continue;
661 
662 		exclude_from_pci(start, end);
663 	}
664 
665 	/*
666 	 * Finish off the pcimemlist
667 	 */
668 	if (prom_debug) {
669 		for (i = 0; i < pcimemlists_used; ++i) {
670 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
671 			    PRIx64 "\n", pcimemlists[i].addr,
672 			    pcimemlists[i].addr + pcimemlists[i].size);
673 		}
674 	}
675 	pcimemlists[0].next = 0;
676 	pcimemlists[0].prev = 0;
677 	for (i = 1; i < pcimemlists_used; ++i) {
678 		pcimemlists[i].prev =
679 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
680 		pcimemlists[i].next = 0;
681 		pcimemlists[i - 1].next =
682 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
683 	}
684 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
685 	DBG(bi->bi_pcimem);
686 }
687 
688 #if defined(__xpv)
689 /*
690  * Initialize memory allocator stuff from hypervisor-supplied start info.
691  *
692  * There is 512KB of scratch area after the boot stack page.
693  * We'll use that for everything except the kernel nucleus pages which are too
694  * big to fit there and are allocated last anyway.
695  */
696 #define	MAXMAPS	100
697 static mmap_t map_buffer[MAXMAPS];
698 static void
699 init_mem_alloc(void)
700 {
701 	int	local;	/* variables needed to find start region */
702 	paddr_t	scratch_start;
703 	xen_memory_map_t map;
704 
705 	DBG_MSG("Entered init_mem_alloc()\n");
706 
707 	/*
708 	 * Free memory follows the stack. There's at least 512KB of scratch
709 	 * space, rounded up to at least 2Mb alignment.  That should be enough
710 	 * for the page tables we'll need to build.  The nucleus memory is
711 	 * allocated last and will be outside the addressible range.  We'll
712 	 * switch to new page tables before we unpack the kernel
713 	 */
714 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
715 	DBG(scratch_start);
716 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
717 	DBG(scratch_end);
718 
719 	/*
720 	 * For paranoia, leave some space between hypervisor data and ours.
721 	 * Use 500 instead of 512.
722 	 */
723 	next_avail_addr = scratch_end - 500 * 1024;
724 	DBG(next_avail_addr);
725 
726 	/*
727 	 * The domain builder gives us at most 1 module
728 	 */
729 	DBG(xen_info->mod_len);
730 	if (xen_info->mod_len > 0) {
731 		DBG(xen_info->mod_start);
732 		modules[0].bm_addr = xen_info->mod_start;
733 		modules[0].bm_size = xen_info->mod_len;
734 		bi->bi_module_cnt = 1;
735 		bi->bi_modules = (native_ptr_t)modules;
736 	} else {
737 		bi->bi_module_cnt = 0;
738 		bi->bi_modules = NULL;
739 	}
740 	DBG(bi->bi_module_cnt);
741 	DBG(bi->bi_modules);
742 
743 	DBG(xen_info->mfn_list);
744 	DBG(xen_info->nr_pages);
745 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
746 	DBG(max_mem);
747 
748 	/*
749 	 * Using pseudo-physical addresses, so only 1 memlist element
750 	 */
751 	memlists[0].addr = 0;
752 	DBG(memlists[0].addr);
753 	memlists[0].size = max_mem;
754 	DBG(memlists[0].size);
755 	memlists_used = 1;
756 	DBG(memlists_used);
757 
758 	/*
759 	 * finish building physinstall list
760 	 */
761 	sort_physinstall();
762 
763 	/*
764 	 * build bios reserved memlists
765 	 */
766 	build_rsvdmemlists();
767 
768 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
769 		/*
770 		 * build PCI Memory list
771 		 */
772 		map.nr_entries = MAXMAPS;
773 		/*LINTED: constant in conditional context*/
774 		set_xen_guest_handle(map.buffer, map_buffer);
775 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
776 			dboot_panic("getting XENMEM_machine_memory_map failed");
777 		build_pcimemlists(map_buffer, map.nr_entries);
778 	}
779 }
780 
781 #else	/* !__xpv */
782 
/*
 * Convert one ASCII hex digit ('0'-'9', 'a'-'f' or 'A'-'F') to its value,
 * 0 through 15.
 *
 * Anything else is not a hex digit and is fatal. Each range is checked at
 * both ends; with only a lower bound, characters such as 'g', 'Z' or ':'
 * would be turned into values above 0xf and quietly end up in the digest
 * that the image hash is compared against.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
797 
798 static void
799 digest_a2h(const char *ascii, uint8_t *digest)
800 {
801 	unsigned int i;
802 
803 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
804 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
805 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
806 	}
807 }
808 
809 /*
810  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
811  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
812  * match, return 0, otherwise -1.  This works only for images smaller than
813  * 4 GB, which should not be a problem.
814  */
815 static int
816 check_image_hash(uint_t midx)
817 {
818 	const char *ascii;
819 	const void *image;
820 	size_t len;
821 	SHA1_CTX ctx;
822 	uint8_t digest[SHA1_DIGEST_LENGTH];
823 	uint8_t baseline[SHA1_DIGEST_LENGTH];
824 	unsigned int i;
825 
826 	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
827 	image = (const void *)(uintptr_t)modules[midx].bm_addr;
828 	len = (size_t)modules[midx].bm_size;
829 
830 	digest_a2h(ascii, baseline);
831 
832 	SHA1Init(&ctx);
833 	SHA1Update(&ctx, image, len);
834 	SHA1Final(digest, &ctx);
835 
836 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
837 		if (digest[i] != baseline[i])
838 			return (-1);
839 	}
840 
841 	return (0);
842 }
843 
844 static const char *
845 type_to_str(boot_module_type_t type)
846 {
847 	switch (type) {
848 	case BMT_ROOTFS:
849 		return ("rootfs");
850 	case BMT_FILE:
851 		return ("file");
852 	case BMT_HASH:
853 		return ("hash");
854 	default:
855 		return ("unknown");
856 	}
857 }
858 
859 static void
860 check_images(void)
861 {
862 	uint_t i;
863 	char displayhash[SHA1_ASCII_LENGTH + 1];
864 
865 	for (i = 0; i < modules_used; i++) {
866 		if (prom_debug) {
867 			dboot_printf("module #%d: name %s type %s "
868 			    "addr %lx size %lx\n",
869 			    i, (char *)(uintptr_t)modules[i].bm_name,
870 			    type_to_str(modules[i].bm_type),
871 			    (ulong_t)modules[i].bm_addr,
872 			    (ulong_t)modules[i].bm_size);
873 		}
874 
875 		if (modules[i].bm_type == BMT_HASH ||
876 		    modules[i].bm_hash == NULL) {
877 			DBG_MSG("module has no hash; skipping check\n");
878 			continue;
879 		}
880 		(void) memcpy(displayhash,
881 		    (void *)(uintptr_t)modules[i].bm_hash,
882 		    SHA1_ASCII_LENGTH);
883 		displayhash[SHA1_ASCII_LENGTH] = '\0';
884 		if (prom_debug) {
885 			dboot_printf("checking expected hash [%s]: ",
886 			    displayhash);
887 		}
888 
889 		if (check_image_hash(i) != 0)
890 			dboot_panic("hash mismatch!\n");
891 		else
892 			DBG_MSG("OK\n");
893 	}
894 }
895 
896 /*
897  * Determine the module's starting address, size, name, and type, and fill the
898  * boot_modules structure.  This structure is used by the bop code, except for
899  * hashes which are checked prior to transferring control to the kernel.
900  */
901 static void
902 process_module(mb_module_t *mod)
903 {
904 	int midx = modules_used++;
905 	char *p, *q;
906 
907 	if (prom_debug) {
908 		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
909 		    midx, (char *)(mod->mod_name),
910 		    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
911 	}
912 
913 	if (mod->mod_start > mod->mod_end) {
914 		dboot_panic("module #%d: module start address 0x%lx greater "
915 		    "than end address 0x%lx", midx,
916 		    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
917 	}
918 
919 	/*
920 	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
921 	 * the address of the last valid byte in a module plus 1 as mod_end.
922 	 * This is of course a bug; the multiboot specification simply states
923 	 * that mod_start and mod_end "contain the start and end addresses of
924 	 * the boot module itself" which is pretty obviously not what GRUB is
925 	 * doing.  However, fixing it requires that not only this code be
926 	 * changed but also that other code consuming this value and values
927 	 * derived from it be fixed, and that the kernel and GRUB must either
928 	 * both have the bug or neither.  While there are a lot of combinations
929 	 * that will work, there are also some that won't, so for simplicity
930 	 * we'll just cope with the bug.  That means we won't actually hash the
931 	 * byte at mod_end, and we will expect that mod_end for the hash file
932 	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
933 	 * hash plus a newline for each module).  We set bm_size to the true
934 	 * correct number of bytes in each module, achieving exactly this.
935 	 */
936 
937 	modules[midx].bm_addr = mod->mod_start;
938 	modules[midx].bm_size = mod->mod_end - mod->mod_start;
939 	modules[midx].bm_name = mod->mod_name;
940 	modules[midx].bm_hash = NULL;
941 	modules[midx].bm_type = BMT_FILE;
942 
943 	if (mod->mod_name == NULL) {
944 		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
945 		return;
946 	}
947 
948 	p = (char *)(uintptr_t)mod->mod_name;
949 	modules[midx].bm_name =
950 	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
951 
952 	while (p != NULL) {
953 		q = strsep(&p, " \t\f\n\r");
954 		if (strncmp(q, "name=", 5) == 0) {
955 			if (q[5] != '\0' && !isspace(q[5])) {
956 				modules[midx].bm_name =
957 				    (native_ptr_t)(uintptr_t)(q + 5);
958 			}
959 			continue;
960 		}
961 
962 		if (strncmp(q, "type=", 5) == 0) {
963 			if (q[5] == '\0' || isspace(q[5]))
964 				continue;
965 			q += 5;
966 			if (strcmp(q, "rootfs") == 0) {
967 				modules[midx].bm_type = BMT_ROOTFS;
968 			} else if (strcmp(q, "hash") == 0) {
969 				modules[midx].bm_type = BMT_HASH;
970 			} else if (strcmp(q, "file") != 0) {
971 				dboot_printf("\tmodule #%d: unknown module "
972 				    "type '%s'; defaulting to 'file'",
973 				    midx, q);
974 			}
975 			continue;
976 		}
977 
978 		if (strncmp(q, "hash=", 5) == 0) {
979 			if (q[5] != '\0' && !isspace(q[5])) {
980 				modules[midx].bm_hash =
981 				    (native_ptr_t)(uintptr_t)(q + 5);
982 			}
983 			continue;
984 		}
985 
986 		dboot_printf("ignoring unknown option '%s'\n", q);
987 	}
988 }
989 
990 /*
991  * Backward compatibility: if there are exactly one or two modules, both
992  * of type 'file' and neither with an embedded hash value, we have been
993  * given the legacy style modules.  In this case we need to treat the first
994  * module as a rootfs and the second as a hash referencing that module.
995  * Otherwise, even if the configuration is invalid, we assume that the
996  * operator knows what he's doing or at least isn't being bitten by this
997  * interface change.
998  */
999 static void
1000 fixup_modules(void)
1001 {
1002 	if (modules_used == 0 || modules_used > 2)
1003 		return;
1004 
1005 	if (modules[0].bm_type != BMT_FILE ||
1006 	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1007 		return;
1008 	}
1009 
1010 	if (modules[0].bm_hash != NULL ||
1011 	    modules_used > 1 && modules[1].bm_hash != NULL) {
1012 		return;
1013 	}
1014 
1015 	modules[0].bm_type = BMT_ROOTFS;
1016 	if (modules_used > 1) {
1017 		modules[1].bm_type = BMT_HASH;
1018 		modules[1].bm_name = modules[0].bm_name;
1019 	}
1020 }
1021 
1022 /*
1023  * For modules that do not have assigned hashes but have a separate hash module,
1024  * find the assigned hash module and set the primary module's bm_hash to point
1025  * to the hash data from that module.  We will then ignore modules of type
1026  * BMT_HASH from this point forward.
1027  */
1028 static void
1029 assign_module_hashes(void)
1030 {
1031 	uint_t i, j;
1032 
1033 	for (i = 0; i < modules_used; i++) {
1034 		if (modules[i].bm_type == BMT_HASH ||
1035 		    modules[i].bm_hash != NULL) {
1036 			continue;
1037 		}
1038 
1039 		for (j = 0; j < modules_used; j++) {
1040 			if (modules[j].bm_type != BMT_HASH ||
1041 			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1042 			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1043 				continue;
1044 			}
1045 
1046 			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1047 				dboot_printf("Short hash module of length "
1048 				    "0x%lx bytes; ignoring\n",
1049 				    (ulong_t)modules[j].bm_size);
1050 			} else {
1051 				modules[i].bm_hash = modules[j].bm_addr;
1052 			}
1053 			break;
1054 		}
1055 	}
1056 }
1057 
1058 /*
1059  * During memory allocation, find the highest address not used yet.
1060  */
1061 static void
1062 check_higher(paddr_t a)
1063 {
1064 	if (a < next_avail_addr)
1065 		return;
1066 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
1067 	DBG(next_avail_addr);
1068 }
1069 
1070 /*
1071  * Walk through the module information finding the last used address.
1072  * The first available address will become the top level page table.
1073  *
1074  * We then build the phys_install memlist from the multiboot information.
1075  */
1076 static void
1077 init_mem_alloc(void)
1078 {
1079 	mb_memory_map_t *mmap;
1080 	mb_module_t *mod;
1081 	uint64_t start;
1082 	uint64_t end;
1083 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
1084 	extern char _end[];
1085 	int i;
1086 
1087 	DBG_MSG("Entered init_mem_alloc()\n");
1088 	DBG((uintptr_t)mb_info);
1089 
1090 	if (mb_info->mods_count > MAX_BOOT_MODULES) {
1091 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1092 		    mb_info->mods_count, MAX_BOOT_MODULES);
1093 	}
1094 	/*
1095 	 * search the modules to find the last used address
1096 	 * we'll build the module list while we're walking through here
1097 	 */
1098 	DBG_MSG("\nFinding Modules\n");
1099 	check_higher((paddr_t)(uintptr_t)&_end);
1100 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
1101 	    i < mb_info->mods_count;
1102 	    ++mod, ++i) {
1103 		process_module(mod);
1104 		check_higher(mod->mod_end);
1105 	}
1106 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1107 	DBG(bi->bi_modules);
1108 	bi->bi_module_cnt = mb_info->mods_count;
1109 	DBG(bi->bi_module_cnt);
1110 
1111 	fixup_modules();
1112 	assign_module_hashes();
1113 	check_images();
1114 
1115 	/*
1116 	 * Walk through the memory map from multiboot and build our memlist
1117 	 * structures. Note these will have native format pointers.
1118 	 */
1119 	DBG_MSG("\nFinding Memory Map\n");
1120 	DBG(mb_info->flags);
1121 	max_mem = 0;
1122 	if (mb_info->flags & 0x40) {
1123 		int cnt = 0;
1124 
1125 		DBG(mb_info->mmap_addr);
1126 		DBG(mb_info->mmap_length);
1127 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
1128 
1129 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
1130 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
1131 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
1132 		    + sizeof (mmap->size))) {
1133 			++cnt;
1134 			start = ((uint64_t)mmap->base_addr_high << 32) +
1135 			    mmap->base_addr_low;
1136 			end = start + ((uint64_t)mmap->length_high << 32) +
1137 			    mmap->length_low;
1138 
1139 			if (prom_debug)
1140 				dboot_printf("\ttype: %d %" PRIx64 "..%"
1141 				    PRIx64 "\n", mmap->type, start, end);
1142 
1143 			/*
1144 			 * page align start and end
1145 			 */
1146 			start = (start + page_offset) & ~page_offset;
1147 			end &= ~page_offset;
1148 			if (end <= start)
1149 				continue;
1150 
1151 			/*
1152 			 * only type 1 is usable RAM
1153 			 */
1154 			switch (mmap->type) {
1155 			case 1:
1156 				if (end > max_mem)
1157 					max_mem = end;
1158 				memlists[memlists_used].addr = start;
1159 				memlists[memlists_used].size = end - start;
1160 				++memlists_used;
1161 				if (memlists_used > MAX_MEMLIST)
1162 					dboot_panic("too many memlists");
1163 				break;
1164 			case 2:
1165 				rsvdmemlists[rsvdmemlists_used].addr = start;
1166 				rsvdmemlists[rsvdmemlists_used].size =
1167 				    end - start;
1168 				++rsvdmemlists_used;
1169 				if (rsvdmemlists_used > MAX_MEMLIST)
1170 					dboot_panic("too many rsvdmemlists");
1171 				break;
1172 			default:
1173 				continue;
1174 			}
1175 		}
1176 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
1177 	} else if (mb_info->flags & 0x01) {
1178 		DBG(mb_info->mem_lower);
1179 		memlists[memlists_used].addr = 0;
1180 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
1181 		++memlists_used;
1182 		DBG(mb_info->mem_upper);
1183 		memlists[memlists_used].addr = 1024 * 1024;
1184 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
1185 		++memlists_used;
1186 
1187 		/*
1188 		 * Old platform - assume I/O space at the end of memory.
1189 		 */
1190 		pcimemlists[0].addr =
1191 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
1192 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1193 		pcimemlists[0].next = 0;
1194 		pcimemlists[0].prev = 0;
1195 		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1196 		DBG(bi->bi_pcimem);
1197 	} else {
1198 		dboot_panic("No memory info from boot loader!!!");
1199 	}
1200 
1201 	check_higher(bi->bi_cmdline);
1202 
1203 	/*
1204 	 * finish processing the physinstall list
1205 	 */
1206 	sort_physinstall();
1207 
1208 	/*
1209 	 * build bios reserved mem lists
1210 	 */
1211 	build_rsvdmemlists();
1212 }
1213 #endif /* !__xpv */
1214 
1215 /*
1216  * Simple memory allocator, allocates aligned physical memory.
1217  * Note that startup_kernel() only allocates memory, never frees.
1218  * Memory usage just grows in an upward direction.
1219  */
1220 static void *
1221 do_mem_alloc(uint32_t size, uint32_t align)
1222 {
1223 	uint_t i;
1224 	uint64_t best;
1225 	uint64_t start;
1226 	uint64_t end;
1227 
1228 	/*
1229 	 * make sure size is a multiple of pagesize
1230 	 */
1231 	size = RNDUP(size, MMU_PAGESIZE);
1232 	next_avail_addr = RNDUP(next_avail_addr, align);
1233 
1234 	/*
1235 	 * XXPV fixme joe
1236 	 *
1237 	 * a really large bootarchive that causes you to run out of memory
1238 	 * may cause this to blow up
1239 	 */
1240 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
1241 	best = (uint64_t)-size;
1242 	for (i = 0; i < memlists_used; ++i) {
1243 		start = memlists[i].addr;
1244 #if defined(__xpv)
1245 		start += mfn_base;
1246 #endif
1247 		end = start + memlists[i].size;
1248 
1249 		/*
1250 		 * did we find the desired address?
1251 		 */
1252 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
1253 			best = next_avail_addr;
1254 			goto done;
1255 		}
1256 
1257 		/*
1258 		 * if not is this address the best so far?
1259 		 */
1260 		if (start > next_avail_addr && start < best &&
1261 		    RNDUP(start, align) + size <= end)
1262 			best = RNDUP(start, align);
1263 	}
1264 
1265 	/*
1266 	 * We didn't find exactly the address we wanted, due to going off the
1267 	 * end of a memory region. Return the best found memory address.
1268 	 */
1269 done:
1270 	next_avail_addr = best + size;
1271 #if defined(__xpv)
1272 	if (next_avail_addr > scratch_end)
1273 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1274 		    "0x%lx", (ulong_t)next_avail_addr,
1275 		    (ulong_t)scratch_end);
1276 #endif
1277 	(void) memset((void *)(uintptr_t)best, 0, size);
1278 	return ((void *)(uintptr_t)best);
1279 }
1280 
1281 void *
1282 mem_alloc(uint32_t size)
1283 {
1284 	return (do_mem_alloc(size, MMU_PAGESIZE));
1285 }
1286 
1287 
1288 /*
1289  * Build page tables to map all of memory used so far as well as the kernel.
1290  */
1291 static void
1292 build_page_tables(void)
1293 {
1294 	uint32_t psize;
1295 	uint32_t level;
1296 	uint32_t off;
1297 	uint64_t start;
1298 #if !defined(__xpv)
1299 	uint32_t i;
1300 	uint64_t end;
1301 #endif	/* __xpv */
1302 
1303 	/*
1304 	 * If we're on metal, we need to create the top level pagetable.
1305 	 */
1306 #if defined(__xpv)
1307 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1308 #else /* __xpv */
1309 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1310 #endif /* __xpv */
1311 	DBG((uintptr_t)top_page_table);
1312 
1313 	/*
1314 	 * Determine if we'll use large mappings for kernel, then map it.
1315 	 */
1316 	if (largepage_support) {
1317 		psize = lpagesize;
1318 		level = 1;
1319 	} else {
1320 		psize = MMU_PAGESIZE;
1321 		level = 0;
1322 	}
1323 
1324 	DBG_MSG("Mapping kernel\n");
1325 	DBG(ktext_phys);
1326 	DBG(target_kernel_text);
1327 	DBG(ksize);
1328 	DBG(psize);
1329 	for (off = 0; off < ksize; off += psize)
1330 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1331 
1332 	/*
1333 	 * The kernel will need a 1 page window to work with page tables
1334 	 */
1335 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1336 	DBG(bi->bi_pt_window);
1337 	bi->bi_pte_to_pt_window =
1338 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1339 	DBG(bi->bi_pte_to_pt_window);
1340 
1341 #if defined(__xpv)
1342 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1343 		/* If this is a domU we're done. */
1344 		DBG_MSG("\nPage tables constructed\n");
1345 		return;
1346 	}
1347 #endif /* __xpv */
1348 
1349 	/*
1350 	 * We need 1:1 mappings for the lower 1M of memory to access
1351 	 * BIOS tables used by a couple of drivers during boot.
1352 	 *
1353 	 * The following code works because our simple memory allocator
1354 	 * only grows usage in an upwards direction.
1355 	 *
1356 	 * Note that by this point in boot some mappings for low memory
1357 	 * may already exist because we've already accessed device in low
1358 	 * memory.  (Specifically the video frame buffer and keyboard
1359 	 * status ports.)  If we're booting on raw hardware then GRUB
1360 	 * created these mappings for us.  If we're booting under a
1361 	 * hypervisor then we went ahead and remapped these devices into
1362 	 * memory allocated within dboot itself.
1363 	 */
1364 	if (map_debug)
1365 		dboot_printf("1:1 map pa=0..1Meg\n");
1366 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1367 #if defined(__xpv)
1368 		map_ma_at_va(start, start, 0);
1369 #else /* __xpv */
1370 		map_pa_at_va(start, start, 0);
1371 #endif /* __xpv */
1372 	}
1373 
1374 #if !defined(__xpv)
1375 	for (i = 0; i < memlists_used; ++i) {
1376 		start = memlists[i].addr;
1377 
1378 		end = start + memlists[i].size;
1379 
1380 		if (map_debug)
1381 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1382 			    start, end);
1383 		while (start < end && start < next_avail_addr) {
1384 			map_pa_at_va(start, start, 0);
1385 			start += MMU_PAGESIZE;
1386 		}
1387 	}
1388 #endif /* !__xpv */
1389 
1390 	DBG_MSG("\nPage tables constructed\n");
1391 }
1392 
/*
 * Text passed to dboot_panic() when the command line still mentions
 * "multiboot"; it tells the user what the grub entry should look like.
 */
#define	NO_MULTIBOOT	\
	"multiboot is no longer used to boot the Solaris Operating System.\n" \
	"The grub entry should be changed to:\n" \
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
	"module$ /platform/i86pc/$ISADIR/boot_archive\n" \
	"See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1399 
1400 /*
1401  * startup_kernel has a pretty simple job. It builds pagetables which reflect
1402  * 1:1 mappings for all memory in use. It then also adds mappings for
1403  * the kernel nucleus at virtual address of target_kernel_text using large page
1404  * mappings. The page table pages are also accessible at 1:1 mapped
1405  * virtual addresses.
1406  */
1407 /*ARGSUSED*/
1408 void
1409 startup_kernel(void)
1410 {
1411 	char *cmdline;
1412 	uintptr_t addr;
1413 #if defined(__xpv)
1414 	physdev_set_iopl_t set_iopl;
1415 #endif /* __xpv */
1416 
1417 	/*
1418 	 * At this point we are executing in a 32 bit real mode.
1419 	 */
1420 #if defined(__xpv)
1421 	cmdline = (char *)xen_info->cmd_line;
1422 #else /* __xpv */
1423 	cmdline = (char *)mb_info->cmdline;
1424 #endif /* __xpv */
1425 
1426 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1427 	map_debug = (strstr(cmdline, "map_debug") != NULL);
1428 
1429 #if defined(__xpv)
1430 	/*
1431 	 * For dom0, before we initialize the console subsystem we'll
1432 	 * need to enable io operations, so set I/O priveldge level to 1.
1433 	 */
1434 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1435 		set_iopl.iopl = 1;
1436 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1437 	}
1438 #endif /* __xpv */
1439 
1440 	bcons_init(cmdline);
1441 	DBG_MSG("\n\nSolaris prekernel set: ");
1442 	DBG_MSG(cmdline);
1443 	DBG_MSG("\n");
1444 
1445 	if (strstr(cmdline, "multiboot") != NULL) {
1446 		dboot_panic(NO_MULTIBOOT);
1447 	}
1448 
1449 	/*
1450 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1451 	 */
1452 	addr = (uintptr_t)boot_info;
1453 	addr = (addr + 0xf) & ~0xf;
1454 	bi = (struct xboot_info *)addr;
1455 	DBG((uintptr_t)bi);
1456 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1457 
1458 	/*
1459 	 * Need correct target_kernel_text value
1460 	 */
1461 #if defined(_BOOT_TARGET_amd64)
1462 	target_kernel_text = KERNEL_TEXT_amd64;
1463 #elif defined(__xpv)
1464 	target_kernel_text = KERNEL_TEXT_i386_xpv;
1465 #else
1466 	target_kernel_text = KERNEL_TEXT_i386;
1467 #endif
1468 	DBG(target_kernel_text);
1469 
1470 #if defined(__xpv)
1471 
1472 	/*
1473 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
1474 	 */
1475 
1476 #if defined(_BOOT_TARGET_amd64)
1477 	/*
1478 	 * 64-bit hypervisor.
1479 	 */
1480 	amd64_support = 1;
1481 	pae_support = 1;
1482 
1483 #else	/* _BOOT_TARGET_amd64 */
1484 
1485 	/*
1486 	 * See if we are running on a PAE Hypervisor
1487 	 */
1488 	{
1489 		xen_capabilities_info_t caps;
1490 
1491 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1492 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
1493 		caps[sizeof (caps) - 1] = 0;
1494 		if (prom_debug)
1495 			dboot_printf("xen capabilities %s\n", caps);
1496 		if (strstr(caps, "x86_32p") != NULL)
1497 			pae_support = 1;
1498 	}
1499 
1500 #endif	/* _BOOT_TARGET_amd64 */
1501 	{
1502 		xen_platform_parameters_t p;
1503 
1504 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1505 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
1506 		DBG(p.virt_start);
1507 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1508 	}
1509 
1510 	/*
1511 	 * The hypervisor loads stuff starting at 1Gig
1512 	 */
1513 	mfn_base = ONE_GIG;
1514 	DBG(mfn_base);
1515 
1516 	/*
1517 	 * enable writable page table mode for the hypervisor
1518 	 */
1519 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1520 	    VMASST_TYPE_writable_pagetables) < 0)
1521 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1522 
1523 	/*
1524 	 * check for NX support
1525 	 */
1526 	if (pae_support) {
1527 		uint32_t eax = 0x80000000;
1528 		uint32_t edx = get_cpuid_edx(&eax);
1529 
1530 		if (eax >= 0x80000001) {
1531 			eax = 0x80000001;
1532 			edx = get_cpuid_edx(&eax);
1533 			if (edx & CPUID_AMD_EDX_NX)
1534 				NX_support = 1;
1535 		}
1536 	}
1537 
1538 #if !defined(_BOOT_TARGET_amd64)
1539 
1540 	/*
1541 	 * The 32-bit hypervisor uses segmentation to protect itself from
1542 	 * guests. This means when a guest attempts to install a flat 4GB
1543 	 * code or data descriptor the 32-bit hypervisor will protect itself
1544 	 * by silently shrinking the segment such that if the guest attempts
1545 	 * any access where the hypervisor lives a #gp fault is generated.
1546 	 * The problem is that some applications expect a full 4GB flat
1547 	 * segment for their current thread pointer and will use negative
1548 	 * offset segment wrap around to access data. TLS support in linux
1549 	 * brand is one example of this.
1550 	 *
1551 	 * The 32-bit hypervisor can catch the #gp fault in these cases
1552 	 * and emulate the access without passing the #gp fault to the guest
1553 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1554 	 * Seems like this should have been the default.
1555 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
1556 	 * to deal with emulating these accesses.
1557 	 */
1558 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1559 	    VMASST_TYPE_4gb_segments) < 0)
1560 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1561 #endif	/* !_BOOT_TARGET_amd64 */
1562 
1563 #else	/* __xpv */
1564 
1565 	/*
1566 	 * use cpuid to enable MMU features
1567 	 */
1568 	if (have_cpuid()) {
1569 		uint32_t eax, edx;
1570 
1571 		eax = 1;
1572 		edx = get_cpuid_edx(&eax);
1573 		if (edx & CPUID_INTC_EDX_PSE)
1574 			largepage_support = 1;
1575 		if (edx & CPUID_INTC_EDX_PGE)
1576 			pge_support = 1;
1577 		if (edx & CPUID_INTC_EDX_PAE)
1578 			pae_support = 1;
1579 
1580 		eax = 0x80000000;
1581 		edx = get_cpuid_edx(&eax);
1582 		if (eax >= 0x80000001) {
1583 			eax = 0x80000001;
1584 			edx = get_cpuid_edx(&eax);
1585 			if (edx & CPUID_AMD_EDX_LM)
1586 				amd64_support = 1;
1587 			if (edx & CPUID_AMD_EDX_NX)
1588 				NX_support = 1;
1589 		}
1590 	} else {
1591 		dboot_printf("cpuid not supported\n");
1592 	}
1593 #endif /* __xpv */
1594 
1595 
1596 #if defined(_BOOT_TARGET_amd64)
1597 	if (amd64_support == 0)
1598 		dboot_panic("long mode not supported, rebooting");
1599 	else if (pae_support == 0)
1600 		dboot_panic("long mode, but no PAE; rebooting");
1601 #else
1602 	/*
1603 	 * Allow the command line to over-ride use of PAE for 32 bit.
1604 	 */
1605 	if (strstr(cmdline, "disablePAE=true") != NULL) {
1606 		pae_support = 0;
1607 		NX_support = 0;
1608 		amd64_support = 0;
1609 	}
1610 #endif
1611 
1612 	/*
1613 	 * initialize the simple memory allocator
1614 	 */
1615 	init_mem_alloc();
1616 
1617 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1618 	/*
1619 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1620 	 */
1621 	if (max_mem < FOUR_GIG && NX_support == 0)
1622 		pae_support = 0;
1623 #endif
1624 
1625 	/*
1626 	 * configure mmu information
1627 	 */
1628 	if (pae_support) {
1629 		shift_amt = shift_amt_pae;
1630 		ptes_per_table = 512;
1631 		pte_size = 8;
1632 		lpagesize = TWO_MEG;
1633 #if defined(_BOOT_TARGET_amd64)
1634 		top_level = 3;
1635 #else
1636 		top_level = 2;
1637 #endif
1638 	} else {
1639 		pae_support = 0;
1640 		NX_support = 0;
1641 		shift_amt = shift_amt_nopae;
1642 		ptes_per_table = 1024;
1643 		pte_size = 4;
1644 		lpagesize = FOUR_MEG;
1645 		top_level = 1;
1646 	}
1647 
1648 	DBG(pge_support);
1649 	DBG(NX_support);
1650 	DBG(largepage_support);
1651 	DBG(amd64_support);
1652 	DBG(top_level);
1653 	DBG(pte_size);
1654 	DBG(ptes_per_table);
1655 	DBG(lpagesize);
1656 
1657 #if defined(__xpv)
1658 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
1659 #else
1660 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
1661 #endif
1662 
1663 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1664 	/*
1665 	 * For grub, copy kernel bits from the ELF64 file to final place.
1666 	 */
1667 	DBG_MSG("\nAllocating nucleus pages.\n");
1668 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1669 	if (ktext_phys == 0)
1670 		dboot_panic("failed to allocate aligned kernel memory");
1671 	if (dboot_elfload64(mb_header.load_addr) != 0)
1672 		dboot_panic("failed to parse kernel ELF image, rebooting");
1673 #endif
1674 
1675 	DBG(ktext_phys);
1676 
1677 	/*
1678 	 * Allocate page tables.
1679 	 */
1680 	build_page_tables();
1681 
1682 	/*
1683 	 * return to assembly code to switch to running kernel
1684 	 */
1685 	entry_addr_low = (uint32_t)target_kernel_text;
1686 	DBG(entry_addr_low);
1687 	bi->bi_use_largepage = largepage_support;
1688 	bi->bi_use_pae = pae_support;
1689 	bi->bi_use_pge = pge_support;
1690 	bi->bi_use_nx = NX_support;
1691 
1692 #if defined(__xpv)
1693 
1694 	bi->bi_next_paddr = next_avail_addr - mfn_base;
1695 	DBG(bi->bi_next_paddr);
1696 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1697 	DBG(bi->bi_next_vaddr);
1698 
1699 	/*
1700 	 * unmap unused pages in start area to make them available for DMA
1701 	 */
1702 	while (next_avail_addr < scratch_end) {
1703 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
1704 		    0, UVMF_INVLPG | UVMF_LOCAL);
1705 		next_avail_addr += MMU_PAGESIZE;
1706 	}
1707 
1708 	bi->bi_xen_start_info = (uintptr_t)xen_info;
1709 	DBG((uintptr_t)HYPERVISOR_shared_info);
1710 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1711 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1712 
1713 #else /* __xpv */
1714 
1715 	bi->bi_next_paddr = next_avail_addr;
1716 	DBG(bi->bi_next_paddr);
1717 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1718 	DBG(bi->bi_next_vaddr);
1719 	bi->bi_mb_info = (uintptr_t)mb_info;
1720 	bi->bi_top_page_table = (uintptr_t)top_page_table;
1721 
1722 #endif /* __xpv */
1723 
1724 	bi->bi_kseg_size = FOUR_MEG;
1725 	DBG(bi->bi_kseg_size);
1726 
1727 #ifndef __xpv
1728 	if (map_debug)
1729 		dump_tables();
1730 #endif
1731 
1732 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1733 }
1734