1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright 2013 Joyent, Inc. All rights reserved.
27 */
28
29
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/sha1.h>
40 #include <util/string.h>
41 #include <util/strtolctype.h>
42
43 #if defined(__xpv)
44
45 #include <sys/hypervisor.h>
46 uintptr_t xen_virt_start;
47 pfn_t *mfn_to_pfn_mapping;
48
49 #else /* !__xpv */
50
51 extern multiboot_header_t mb_header;
52 extern uint32_t mb2_load_addr;
53 extern int have_cpuid(void);
54
55 #endif /* !__xpv */
56
57 #include <sys/inttypes.h>
58 #include <sys/bootinfo.h>
59 #include <sys/mach_mmu.h>
60 #include <sys/boot_console.h>
61
62 #include "dboot_asm.h"
63 #include "dboot_printf.h"
64 #include "dboot_xboot.h"
65 #include "dboot_elfload.h"
66
/* Length of a SHA-1 digest written as ASCII hex: two characters per byte. */
#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
68
/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or a XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
 */
86
87 /*
88 * Standard bits used in PTE (page level) and PTP (internal levels)
89 */
90 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
91 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
92
93 /*
94 * This is the target addresses (physical) where the kernel text and data
95 * nucleus pages will be unpacked. On the hypervisor this is actually a
96 * virtual address.
97 */
98 paddr_t ktext_phys;
99 uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */
100
101 static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */
102
103 /*
104 * The stack is setup in assembler before entering startup_kernel()
105 */
106 char stack_space[STACK_SIZE];
107
108 /*
109 * Used to track physical memory allocation
110 */
111 static paddr_t next_avail_addr = 0;
112
113 #if defined(__xpv)
114 /*
115 * Additional information needed for hypervisor memory allocation.
116 * Only memory up to scratch_end is mapped by page tables.
117 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
118 * to derive a pfn from a pointer, you subtract mfn_base.
119 */
120
121 static paddr_t scratch_end = 0; /* we can't write all of mem here */
122 static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */
123 start_info_t *xen_info;
124
125 #else /* __xpv */
126
127 /*
128 * If on the metal, then we have a multiboot loader.
129 */
130 uint32_t mb_magic; /* magic from boot loader */
131 uint32_t mb_addr; /* multiboot info package from loader */
132 int multiboot_version;
133 multiboot_info_t *mb_info;
134 multiboot2_info_header_t *mb2_info;
135 multiboot_tag_mmap_t *mb2_mmap_tagp;
136 int num_entries; /* mmap entry count */
137 boolean_t num_entries_set; /* is mmap entry count set */
138 uintptr_t load_addr;
139
140 #endif /* __xpv */
141
142 /*
143 * This contains information passed to the kernel
144 */
145 struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */
146 struct xboot_info *bi;
147
148 /*
149 * Page table and memory stuff.
150 */
151 static paddr_t max_mem; /* maximum memory address */
152
153 /*
154 * Information about processor MMU
155 */
156 int amd64_support = 0;
157 int largepage_support = 0;
158 int pae_support = 0;
159 int pge_support = 0;
160 int NX_support = 0;
161
162 /*
163 * Low 32 bits of kernel entry address passed back to assembler.
164 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
165 */
166 uint32_t entry_addr_low;
167
168 /*
169 * Memlists for the kernel. We shouldn't need a lot of these.
170 */
171 #define MAX_MEMLIST (50)
172 struct boot_memlist memlists[MAX_MEMLIST];
173 uint_t memlists_used = 0;
174 struct boot_memlist pcimemlists[MAX_MEMLIST];
175 uint_t pcimemlists_used = 0;
176 struct boot_memlist rsvdmemlists[MAX_MEMLIST];
177 uint_t rsvdmemlists_used = 0;
178
179 /*
180 * This should match what's in the bootloader. It's arbitrary, but GRUB
181 * in particular has limitations on how much space it can use before it
182 * stops working properly. This should be enough.
183 */
184 struct boot_modules modules[MAX_BOOT_MODULES];
185 uint_t modules_used = 0;
186
187 #ifdef __xpv
188 /*
189 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
190 * definition in Xen source.
191 */
192 typedef struct {
193 uint32_t base_addr_low;
194 uint32_t base_addr_high;
195 uint32_t length_low;
196 uint32_t length_high;
197 uint32_t type;
198 } mmap_t;
199
200 /*
201 * There is 512KB of scratch area after the boot stack page.
202 * We'll use that for everything except the kernel nucleus pages which are too
203 * big to fit there and are allocated last anyway.
204 */
205 #define MAXMAPS 100
206 static mmap_t map_buffer[MAXMAPS];
207 #else
208 typedef mb_memory_map_t mmap_t;
209 #endif
210
211 /*
212 * Debugging macros
213 */
214 uint_t prom_debug = 0;
215 uint_t map_debug = 0;
216
217 static char noname[2] = "-";
218
219 /*
220 * Either hypervisor-specific or grub-specific code builds the initial
221 * memlists. This code does the sort/merge/link for final use.
222 */
223 static void
sort_physinstall(void)224 sort_physinstall(void)
225 {
226 int i;
227 #if !defined(__xpv)
228 int j;
229 struct boot_memlist tmp;
230
231 /*
232 * Now sort the memlists, in case they weren't in order.
233 * Yeah, this is a bubble sort; small, simple and easy to get right.
234 */
235 DBG_MSG("Sorting phys-installed list\n");
236 for (j = memlists_used - 1; j > 0; --j) {
237 for (i = 0; i < j; ++i) {
238 if (memlists[i].addr < memlists[i + 1].addr)
239 continue;
240 tmp = memlists[i];
241 memlists[i] = memlists[i + 1];
242 memlists[i + 1] = tmp;
243 }
244 }
245
246 /*
247 * Merge any memlists that don't have holes between them.
248 */
249 for (i = 0; i <= memlists_used - 1; ++i) {
250 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
251 continue;
252
253 if (prom_debug)
254 dboot_printf(
255 "merging mem segs %" PRIx64 "...%" PRIx64
256 " w/ %" PRIx64 "...%" PRIx64 "\n",
257 memlists[i].addr,
258 memlists[i].addr + memlists[i].size,
259 memlists[i + 1].addr,
260 memlists[i + 1].addr + memlists[i + 1].size);
261
262 memlists[i].size += memlists[i + 1].size;
263 for (j = i + 1; j < memlists_used - 1; ++j)
264 memlists[j] = memlists[j + 1];
265 --memlists_used;
266 DBG(memlists_used);
267 --i; /* after merging we need to reexamine, so do this */
268 }
269 #endif /* __xpv */
270
271 if (prom_debug) {
272 dboot_printf("\nFinal memlists:\n");
273 for (i = 0; i < memlists_used; ++i) {
274 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
275 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
276 }
277 }
278
279 /*
280 * link together the memlists with native size pointers
281 */
282 memlists[0].next = 0;
283 memlists[0].prev = 0;
284 for (i = 1; i < memlists_used; ++i) {
285 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
286 memlists[i].next = 0;
287 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
288 }
289 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
290 DBG(bi->bi_phys_install);
291 }
292
293 /*
294 * build bios reserved memlists
295 */
296 static void
build_rsvdmemlists(void)297 build_rsvdmemlists(void)
298 {
299 int i;
300
301 rsvdmemlists[0].next = 0;
302 rsvdmemlists[0].prev = 0;
303 for (i = 1; i < rsvdmemlists_used; ++i) {
304 rsvdmemlists[i].prev =
305 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
306 rsvdmemlists[i].next = 0;
307 rsvdmemlists[i - 1].next =
308 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
309 }
310 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
311 DBG(bi->bi_rsvdmem);
312 }
313
314 #if defined(__xpv)
315
316 /*
317 * halt on the hypervisor after a delay to drain console output
318 */
319 void
dboot_halt(void)320 dboot_halt(void)
321 {
322 uint_t i = 10000;
323
324 while (--i)
325 (void) HYPERVISOR_yield();
326 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
327 }
328
329 /*
330 * From a machine address, find the corresponding pseudo-physical address.
331 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
332 * Machine addresses are the real underlying hardware addresses.
333 * These are needed for page table entries. Note that this routine is
334 * poorly protected. A bad value of "ma" will cause a page fault.
335 */
336 paddr_t
ma_to_pa(maddr_t ma)337 ma_to_pa(maddr_t ma)
338 {
339 ulong_t pgoff = ma & MMU_PAGEOFFSET;
340 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
341 paddr_t pa;
342
343 if (pfn >= xen_info->nr_pages)
344 return (-(paddr_t)1);
345 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
346 #ifdef DEBUG
347 if (ma != pa_to_ma(pa))
348 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
349 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
350 #endif
351 return (pa);
352 }
353
354 /*
355 * From a pseudo-physical address, find the corresponding machine address.
356 */
357 maddr_t
pa_to_ma(paddr_t pa)358 pa_to_ma(paddr_t pa)
359 {
360 pfn_t pfn;
361 ulong_t mfn;
362
363 pfn = mmu_btop(pa - mfn_base);
364 if (pa < mfn_base || pfn >= xen_info->nr_pages)
365 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
366 mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
367 #ifdef DEBUG
368 if (mfn_to_pfn_mapping[mfn] != pfn)
369 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
370 pfn, mfn, mfn_to_pfn_mapping[mfn]);
371 #endif
372 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
373 }
374
375 #endif /* __xpv */
376
377 x86pte_t
get_pteval(paddr_t table,uint_t index)378 get_pteval(paddr_t table, uint_t index)
379 {
380 if (pae_support)
381 return (((x86pte_t *)(uintptr_t)table)[index]);
382 return (((x86pte32_t *)(uintptr_t)table)[index]);
383 }
384
385 /*ARGSUSED*/
386 void
set_pteval(paddr_t table,uint_t index,uint_t level,x86pte_t pteval)387 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
388 {
389 #ifdef __xpv
390 mmu_update_t t;
391 maddr_t mtable = pa_to_ma(table);
392 int retcnt;
393
394 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
395 t.val = pteval;
396 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
397 dboot_panic("HYPERVISOR_mmu_update() failed");
398 #else /* __xpv */
399 uintptr_t tab_addr = (uintptr_t)table;
400
401 if (pae_support)
402 ((x86pte_t *)tab_addr)[index] = pteval;
403 else
404 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
405 if (level == top_level && level == 2)
406 reload_cr3();
407 #endif /* __xpv */
408 }
409
410 paddr_t
make_ptable(x86pte_t * pteval,uint_t level)411 make_ptable(x86pte_t *pteval, uint_t level)
412 {
413 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
414
415 if (level == top_level && level == 2)
416 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
417 else
418 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
419
420 #ifdef __xpv
421 /* Remove write permission to the new page table. */
422 if (HYPERVISOR_update_va_mapping(new_table,
423 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
424 dboot_panic("HYP_update_va_mapping error");
425 #endif
426
427 if (map_debug)
428 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
429 PRIx64 "\n", level, (ulong_t)new_table, *pteval);
430 return (new_table);
431 }
432
433 x86pte_t *
map_pte(paddr_t table,uint_t index)434 map_pte(paddr_t table, uint_t index)
435 {
436 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
437 }
438
439 /*
440 * dump out the contents of page tables...
441 */
442 static void
dump_tables(void)443 dump_tables(void)
444 {
445 uint_t save_index[4]; /* for recursion */
446 char *save_table[4]; /* for recursion */
447 uint_t l;
448 uint64_t va;
449 uint64_t pgsize;
450 int index;
451 int i;
452 x86pte_t pteval;
453 char *table;
454 static char *tablist = "\t\t\t";
455 char *tabs = tablist + 3 - top_level;
456 uint_t pa, pa1;
457 #if !defined(__xpv)
458 #define maddr_t paddr_t
459 #endif /* !__xpv */
460
461 dboot_printf("Finished pagetables:\n");
462 table = (char *)(uintptr_t)top_page_table;
463 l = top_level;
464 va = 0;
465 for (index = 0; index < ptes_per_table; ++index) {
466 pgsize = 1ull << shift_amt[l];
467 if (pae_support)
468 pteval = ((x86pte_t *)table)[index];
469 else
470 pteval = ((x86pte32_t *)table)[index];
471 if (pteval == 0)
472 goto next_entry;
473
474 dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
475 tabs + l, (void *)table, index, (uint64_t)pteval, va);
476 pa = ma_to_pa(pteval & MMU_PAGEMASK);
477 dboot_printf(" physaddr=%x\n", pa);
478
479 /*
480 * Don't try to walk hypervisor private pagetables
481 */
482 if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
483 save_table[l] = table;
484 save_index[l] = index;
485 --l;
486 index = -1;
487 table = (char *)(uintptr_t)
488 ma_to_pa(pteval & MMU_PAGEMASK);
489 goto recursion;
490 }
491
492 /*
493 * shorten dump for consecutive mappings
494 */
495 for (i = 1; index + i < ptes_per_table; ++i) {
496 if (pae_support)
497 pteval = ((x86pte_t *)table)[index + i];
498 else
499 pteval = ((x86pte32_t *)table)[index + i];
500 if (pteval == 0)
501 break;
502 pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
503 if (pa1 != pa + i * pgsize)
504 break;
505 }
506 if (i > 2) {
507 dboot_printf("%s...\n", tabs + l);
508 va += pgsize * (i - 2);
509 index += i - 2;
510 }
511 next_entry:
512 va += pgsize;
513 if (l == 3 && index == 256) /* VA hole */
514 va = 0xffff800000000000ull;
515 recursion:
516 ;
517 }
518 if (l < top_level) {
519 ++l;
520 index = save_index[l];
521 table = save_table[l];
522 goto recursion;
523 }
524 }
525
526 /*
527 * Add a mapping for the machine page at the given virtual address.
528 */
529 static void
map_ma_at_va(maddr_t ma,native_ptr_t va,uint_t level)530 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
531 {
532 x86pte_t *ptep;
533 x86pte_t pteval;
534
535 pteval = ma | pte_bits;
536 if (level > 0)
537 pteval |= PT_PAGESIZE;
538 if (va >= target_kernel_text && pge_support)
539 pteval |= PT_GLOBAL;
540
541 if (map_debug && ma != va)
542 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
543 " pte=0x%" PRIx64 " l=%d\n",
544 (uint64_t)ma, (uint64_t)va, pteval, level);
545
546 #if defined(__xpv)
547 /*
548 * see if we can avoid find_pte() on the hypervisor
549 */
550 if (HYPERVISOR_update_va_mapping(va, pteval,
551 UVMF_INVLPG | UVMF_LOCAL) == 0)
552 return;
553 #endif
554
555 /*
556 * Find the pte that will map this address. This creates any
557 * missing intermediate level page tables
558 */
559 ptep = find_pte(va, NULL, level, 0);
560
561 /*
562 * When paravirtualized, we must use hypervisor calls to modify the
563 * PTE, since paging is active. On real hardware we just write to
564 * the pagetables which aren't in use yet.
565 */
566 #if defined(__xpv)
567 ptep = ptep; /* shut lint up */
568 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
569 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
570 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
571 (uint64_t)va, level, (uint64_t)ma, pteval);
572 #else
573 if (va < 1024 * 1024)
574 pteval |= PT_NOCACHE; /* for video RAM */
575 if (pae_support)
576 *ptep = pteval;
577 else
578 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
579 #endif
580 }
581
582 /*
583 * Add a mapping for the physical page at the given virtual address.
584 */
585 static void
map_pa_at_va(paddr_t pa,native_ptr_t va,uint_t level)586 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
587 {
588 map_ma_at_va(pa_to_ma(pa), va, level);
589 }
590
591 /*
592 * This is called to remove start..end from the
593 * possible range of PCI addresses.
594 */
595 const uint64_t pci_lo_limit = 0x00100000ul;
596 const uint64_t pci_hi_limit = 0xfff00000ul;
597 static void
exclude_from_pci(uint64_t start,uint64_t end)598 exclude_from_pci(uint64_t start, uint64_t end)
599 {
600 int i;
601 int j;
602 struct boot_memlist *ml;
603
604 for (i = 0; i < pcimemlists_used; ++i) {
605 ml = &pcimemlists[i];
606
607 /* delete the entire range? */
608 if (start <= ml->addr && ml->addr + ml->size <= end) {
609 --pcimemlists_used;
610 for (j = i; j < pcimemlists_used; ++j)
611 pcimemlists[j] = pcimemlists[j + 1];
612 --i; /* to revisit the new one at this index */
613 }
614
615 /* split a range? */
616 else if (ml->addr < start && end < ml->addr + ml->size) {
617
618 ++pcimemlists_used;
619 if (pcimemlists_used > MAX_MEMLIST)
620 dboot_panic("too many pcimemlists");
621
622 for (j = pcimemlists_used - 1; j > i; --j)
623 pcimemlists[j] = pcimemlists[j - 1];
624 ml->size = start - ml->addr;
625
626 ++ml;
627 ml->size = (ml->addr + ml->size) - end;
628 ml->addr = end;
629 ++i; /* skip on to next one */
630 }
631
632 /* cut memory off the start? */
633 else if (ml->addr < end && end < ml->addr + ml->size) {
634 ml->size -= end - ml->addr;
635 ml->addr = end;
636 }
637
638 /* cut memory off the end? */
639 else if (ml->addr <= start && start < ml->addr + ml->size) {
640 ml->size = start - ml->addr;
641 }
642 }
643 }
644
645 /*
646 * During memory allocation, find the highest address not used yet.
647 */
648 static void
check_higher(paddr_t a)649 check_higher(paddr_t a)
650 {
651 if (a < next_avail_addr)
652 return;
653 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
654 DBG(next_avail_addr);
655 }
656
657 static int
dboot_loader_mmap_entries(void)658 dboot_loader_mmap_entries(void)
659 {
660 #if !defined(__xpv)
661 if (num_entries_set == B_TRUE)
662 return (num_entries);
663
664 switch (multiboot_version) {
665 case 1:
666 DBG(mb_info->flags);
667 if (mb_info->flags & 0x40) {
668 mb_memory_map_t *mmap;
669
670 DBG(mb_info->mmap_addr);
671 DBG(mb_info->mmap_length);
672 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
673
674 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
675 (uint32_t)mmap < mb_info->mmap_addr +
676 mb_info->mmap_length;
677 mmap = (mb_memory_map_t *)((uint32_t)mmap +
678 mmap->size + sizeof (mmap->size)))
679 ++num_entries;
680
681 num_entries_set = B_TRUE;
682 }
683 break;
684 case 2:
685 num_entries_set = B_TRUE;
686 num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
687 mb2_mmap_tagp);
688 break;
689 default:
690 dboot_panic("Unknown multiboot version: %d\n",
691 multiboot_version);
692 break;
693 }
694 return (num_entries);
695 #else
696 return (MAXMAPS);
697 #endif
698 }
699
700 static uint32_t
dboot_loader_mmap_get_type(int index)701 dboot_loader_mmap_get_type(int index)
702 {
703 #if !defined(__xpv)
704 mb_memory_map_t *mp, *mpend;
705 int i;
706
707 switch (multiboot_version) {
708 case 1:
709 mp = (mb_memory_map_t *)mb_info->mmap_addr;
710 mpend = (mb_memory_map_t *)
711 (mb_info->mmap_addr + mb_info->mmap_length);
712
713 for (i = 0; mp < mpend && i != index; i++)
714 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
715 sizeof (mp->size));
716 if (mp >= mpend) {
717 dboot_panic("dboot_loader_mmap_get_type(): index "
718 "out of bounds: %d\n", index);
719 }
720 return (mp->type);
721
722 case 2:
723 return (dboot_multiboot2_mmap_get_type(mb2_info,
724 mb2_mmap_tagp, index));
725
726 default:
727 dboot_panic("Unknown multiboot version: %d\n",
728 multiboot_version);
729 break;
730 }
731 return (0);
732 #else
733 return (map_buffer[index].type);
734 #endif
735 }
736
737 static uint64_t
dboot_loader_mmap_get_base(int index)738 dboot_loader_mmap_get_base(int index)
739 {
740 #if !defined(__xpv)
741 mb_memory_map_t *mp, *mpend;
742 int i;
743
744 switch (multiboot_version) {
745 case 1:
746 mp = (mb_memory_map_t *)mb_info->mmap_addr;
747 mpend = (mb_memory_map_t *)
748 (mb_info->mmap_addr + mb_info->mmap_length);
749
750 for (i = 0; mp < mpend && i != index; i++)
751 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
752 sizeof (mp->size));
753 if (mp >= mpend) {
754 dboot_panic("dboot_loader_mmap_get_base(): index "
755 "out of bounds: %d\n", index);
756 }
757 return (((uint64_t)mp->base_addr_high << 32) +
758 (uint64_t)mp->base_addr_low);
759
760 case 2:
761 return (dboot_multiboot2_mmap_get_base(mb2_info,
762 mb2_mmap_tagp, index));
763
764 default:
765 dboot_panic("Unknown multiboot version: %d\n",
766 multiboot_version);
767 break;
768 }
769 return (0);
770 #else
771 return (((uint64_t)map_buffer[index].base_addr_high << 32) +
772 (uint64_t)map_buffer[index].base_addr_low);
773 #endif
774 }
775
776 static uint64_t
dboot_loader_mmap_get_length(int index)777 dboot_loader_mmap_get_length(int index)
778 {
779 #if !defined(__xpv)
780 mb_memory_map_t *mp, *mpend;
781 int i;
782
783 switch (multiboot_version) {
784 case 1:
785 mp = (mb_memory_map_t *)mb_info->mmap_addr;
786 mpend = (mb_memory_map_t *)
787 (mb_info->mmap_addr + mb_info->mmap_length);
788
789 for (i = 0; mp < mpend && i != index; i++)
790 mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
791 sizeof (mp->size));
792 if (mp >= mpend) {
793 dboot_panic("dboot_loader_mmap_get_length(): index "
794 "out of bounds: %d\n", index);
795 }
796 return (((uint64_t)mp->length_high << 32) +
797 (uint64_t)mp->length_low);
798
799 case 2:
800 return (dboot_multiboot2_mmap_get_length(mb2_info,
801 mb2_mmap_tagp, index));
802
803 default:
804 dboot_panic("Unknown multiboot version: %d\n",
805 multiboot_version);
806 break;
807 }
808 return (0);
809 #else
810 return (((uint64_t)map_buffer[index].length_high << 32) +
811 (uint64_t)map_buffer[index].length_low);
812 #endif
813 }
814
815 static void
build_pcimemlists(void)816 build_pcimemlists(void)
817 {
818 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
819 uint64_t start;
820 uint64_t end;
821 int i, num;
822
823 /*
824 * initialize
825 */
826 pcimemlists[0].addr = pci_lo_limit;
827 pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
828 pcimemlists_used = 1;
829
830 num = dboot_loader_mmap_entries();
831 /*
832 * Fill in PCI memlists.
833 */
834 for (i = 0; i < num; ++i) {
835 start = dboot_loader_mmap_get_base(i);
836 end = start + dboot_loader_mmap_get_length(i);
837
838 if (prom_debug)
839 dboot_printf("\ttype: %d %" PRIx64 "..%"
840 PRIx64 "\n", dboot_loader_mmap_get_type(i),
841 start, end);
842
843 /*
844 * page align start and end
845 */
846 start = (start + page_offset) & ~page_offset;
847 end &= ~page_offset;
848 if (end <= start)
849 continue;
850
851 exclude_from_pci(start, end);
852 }
853
854 /*
855 * Finish off the pcimemlist
856 */
857 if (prom_debug) {
858 for (i = 0; i < pcimemlists_used; ++i) {
859 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
860 PRIx64 "\n", pcimemlists[i].addr,
861 pcimemlists[i].addr + pcimemlists[i].size);
862 }
863 }
864 pcimemlists[0].next = 0;
865 pcimemlists[0].prev = 0;
866 for (i = 1; i < pcimemlists_used; ++i) {
867 pcimemlists[i].prev =
868 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
869 pcimemlists[i].next = 0;
870 pcimemlists[i - 1].next =
871 (native_ptr_t)(uintptr_t)(pcimemlists + i);
872 }
873 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
874 DBG(bi->bi_pcimem);
875 }
876
877 #if defined(__xpv)
878 /*
879 * Initialize memory allocator stuff from hypervisor-supplied start info.
880 */
881 static void
init_mem_alloc(void)882 init_mem_alloc(void)
883 {
884 int local; /* variables needed to find start region */
885 paddr_t scratch_start;
886 xen_memory_map_t map;
887
888 DBG_MSG("Entered init_mem_alloc()\n");
889
890 /*
891 * Free memory follows the stack. There's at least 512KB of scratch
892 * space, rounded up to at least 2Mb alignment. That should be enough
893 * for the page tables we'll need to build. The nucleus memory is
894 * allocated last and will be outside the addressible range. We'll
895 * switch to new page tables before we unpack the kernel
896 */
897 scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
898 DBG(scratch_start);
899 scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
900 DBG(scratch_end);
901
902 /*
903 * For paranoia, leave some space between hypervisor data and ours.
904 * Use 500 instead of 512.
905 */
906 next_avail_addr = scratch_end - 500 * 1024;
907 DBG(next_avail_addr);
908
909 /*
910 * The domain builder gives us at most 1 module
911 */
912 DBG(xen_info->mod_len);
913 if (xen_info->mod_len > 0) {
914 DBG(xen_info->mod_start);
915 modules[0].bm_addr = xen_info->mod_start;
916 modules[0].bm_size = xen_info->mod_len;
917 bi->bi_module_cnt = 1;
918 bi->bi_modules = (native_ptr_t)modules;
919 } else {
920 bi->bi_module_cnt = 0;
921 bi->bi_modules = NULL;
922 }
923 DBG(bi->bi_module_cnt);
924 DBG(bi->bi_modules);
925
926 DBG(xen_info->mfn_list);
927 DBG(xen_info->nr_pages);
928 max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
929 DBG(max_mem);
930
931 /*
932 * Using pseudo-physical addresses, so only 1 memlist element
933 */
934 memlists[0].addr = 0;
935 DBG(memlists[0].addr);
936 memlists[0].size = max_mem;
937 DBG(memlists[0].size);
938 memlists_used = 1;
939 DBG(memlists_used);
940
941 /*
942 * finish building physinstall list
943 */
944 sort_physinstall();
945
946 /*
947 * build bios reserved memlists
948 */
949 build_rsvdmemlists();
950
951 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
952 /*
953 * build PCI Memory list
954 */
955 map.nr_entries = MAXMAPS;
956 /*LINTED: constant in conditional context*/
957 set_xen_guest_handle(map.buffer, map_buffer);
958 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
959 dboot_panic("getting XENMEM_machine_memory_map failed");
960 build_pcimemlists();
961 }
962 }
963
964 #else /* !__xpv */
965
/*
 * Multiboot 1 console information hook.  Intentionally empty: this is a
 * stub in this version.
 */
static void
dboot_multiboot1_xboot_consinfo(void)
{
}
971
/*
 * Multiboot 2 console information hook.  Intentionally empty: this is a
 * stub in this version.
 */
static void
dboot_multiboot2_xboot_consinfo(void)
{
}
977
978 static int
dboot_multiboot_modcount(void)979 dboot_multiboot_modcount(void)
980 {
981 switch (multiboot_version) {
982 case 1:
983 return (mb_info->mods_count);
984
985 case 2:
986 return (dboot_multiboot2_modcount(mb2_info));
987
988 default:
989 dboot_panic("Unknown multiboot version: %d\n",
990 multiboot_version);
991 break;
992 }
993 return (0);
994 }
995
996 static uint32_t
dboot_multiboot_modstart(int index)997 dboot_multiboot_modstart(int index)
998 {
999 switch (multiboot_version) {
1000 case 1:
1001 return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1002
1003 case 2:
1004 return (dboot_multiboot2_modstart(mb2_info, index));
1005
1006 default:
1007 dboot_panic("Unknown multiboot version: %d\n",
1008 multiboot_version);
1009 break;
1010 }
1011 return (0);
1012 }
1013
1014 static uint32_t
dboot_multiboot_modend(int index)1015 dboot_multiboot_modend(int index)
1016 {
1017 switch (multiboot_version) {
1018 case 1:
1019 return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1020
1021 case 2:
1022 return (dboot_multiboot2_modend(mb2_info, index));
1023
1024 default:
1025 dboot_panic("Unknown multiboot version: %d\n",
1026 multiboot_version);
1027 break;
1028 }
1029 return (0);
1030 }
1031
1032 static char *
dboot_multiboot_modcmdline(int index)1033 dboot_multiboot_modcmdline(int index)
1034 {
1035 switch (multiboot_version) {
1036 case 1:
1037 return ((char *)((mb_module_t *)
1038 mb_info->mods_addr)[index].mod_name);
1039
1040 case 2:
1041 return (dboot_multiboot2_modcmdline(mb2_info, index));
1042
1043 default:
1044 dboot_panic("Unknown multiboot version: %d\n",
1045 multiboot_version);
1046 break;
1047 }
1048 return (0);
1049 }
1050
1051 static boolean_t
dboot_multiboot_basicmeminfo(uint32_t * lower,uint32_t * upper)1052 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1053 {
1054 boolean_t rv = B_FALSE;
1055
1056 switch (multiboot_version) {
1057 case 1:
1058 if (mb_info->flags & 0x01) {
1059 *lower = mb_info->mem_lower;
1060 *upper = mb_info->mem_upper;
1061 rv = B_TRUE;
1062 }
1063 break;
1064
1065 case 2:
1066 return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1067
1068 default:
1069 dboot_panic("Unknown multiboot version: %d\n",
1070 multiboot_version);
1071 break;
1072 }
1073 return (rv);
1074 }
1075
1076 static uint8_t
dboot_a2h(char v)1077 dboot_a2h(char v)
1078 {
1079 if (v >= 'a')
1080 return (v - 'a' + 0xa);
1081 else if (v >= 'A')
1082 return (v - 'A' + 0xa);
1083 else if (v >= '0')
1084 return (v - '0');
1085 else
1086 dboot_panic("bad ASCII hex character %c\n", v);
1087
1088 return (0);
1089 }
1090
1091 static void
digest_a2h(const char * ascii,uint8_t * digest)1092 digest_a2h(const char *ascii, uint8_t *digest)
1093 {
1094 unsigned int i;
1095
1096 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1097 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1098 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1099 }
1100 }
1101
1102 /*
1103 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1104 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
1105 * match, return 0, otherwise -1. This works only for images smaller than
1106 * 4 GB, which should not be a problem.
1107 */
1108 static int
check_image_hash(uint_t midx)1109 check_image_hash(uint_t midx)
1110 {
1111 const char *ascii;
1112 const void *image;
1113 size_t len;
1114 SHA1_CTX ctx;
1115 uint8_t digest[SHA1_DIGEST_LENGTH];
1116 uint8_t baseline[SHA1_DIGEST_LENGTH];
1117 unsigned int i;
1118
1119 ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1120 image = (const void *)(uintptr_t)modules[midx].bm_addr;
1121 len = (size_t)modules[midx].bm_size;
1122
1123 digest_a2h(ascii, baseline);
1124
1125 SHA1Init(&ctx);
1126 SHA1Update(&ctx, image, len);
1127 SHA1Final(digest, &ctx);
1128
1129 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1130 if (digest[i] != baseline[i])
1131 return (-1);
1132 }
1133
1134 return (0);
1135 }
1136
1137 static const char *
type_to_str(boot_module_type_t type)1138 type_to_str(boot_module_type_t type)
1139 {
1140 switch (type) {
1141 case BMT_ROOTFS:
1142 return ("rootfs");
1143 case BMT_FILE:
1144 return ("file");
1145 case BMT_HASH:
1146 return ("hash");
1147 default:
1148 return ("unknown");
1149 }
1150 }
1151
1152 static void
check_images(void)1153 check_images(void)
1154 {
1155 uint_t i;
1156 char displayhash[SHA1_ASCII_LENGTH + 1];
1157
1158 for (i = 0; i < modules_used; i++) {
1159 if (prom_debug) {
1160 dboot_printf("module #%d: name %s type %s "
1161 "addr %lx size %lx\n",
1162 i, (char *)(uintptr_t)modules[i].bm_name,
1163 type_to_str(modules[i].bm_type),
1164 (ulong_t)modules[i].bm_addr,
1165 (ulong_t)modules[i].bm_size);
1166 }
1167
1168 if (modules[i].bm_type == BMT_HASH ||
1169 modules[i].bm_hash == NULL) {
1170 DBG_MSG("module has no hash; skipping check\n");
1171 continue;
1172 }
1173 (void) memcpy(displayhash,
1174 (void *)(uintptr_t)modules[i].bm_hash,
1175 SHA1_ASCII_LENGTH);
1176 displayhash[SHA1_ASCII_LENGTH] = '\0';
1177 if (prom_debug) {
1178 dboot_printf("checking expected hash [%s]: ",
1179 displayhash);
1180 }
1181
1182 if (check_image_hash(i) != 0)
1183 dboot_panic("hash mismatch!\n");
1184 else
1185 DBG_MSG("OK\n");
1186 }
1187 }
1188
1189 /*
1190 * Determine the module's starting address, size, name, and type, and fill the
1191 * boot_modules structure. This structure is used by the bop code, except for
1192 * hashes which are checked prior to transferring control to the kernel.
1193 */
1194 static void
process_module(int midx)1195 process_module(int midx)
1196 {
1197 uint32_t mod_start = dboot_multiboot_modstart(midx);
1198 uint32_t mod_end = dboot_multiboot_modend(midx);
1199 char *cmdline = dboot_multiboot_modcmdline(midx);
1200 char *p, *q;
1201
1202 check_higher(mod_end);
1203 if (prom_debug) {
1204 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1205 midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1206 }
1207
1208 if (mod_start > mod_end) {
1209 dboot_panic("module #%d: module start address 0x%lx greater "
1210 "than end address 0x%lx", midx,
1211 (ulong_t)mod_start, (ulong_t)mod_end);
1212 }
1213
1214 /*
1215 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1216 * the address of the last valid byte in a module plus 1 as mod_end.
1217 * This is of course a bug; the multiboot specification simply states
1218 * that mod_start and mod_end "contain the start and end addresses of
1219 * the boot module itself" which is pretty obviously not what GRUB is
1220 * doing. However, fixing it requires that not only this code be
1221 * changed but also that other code consuming this value and values
1222 * derived from it be fixed, and that the kernel and GRUB must either
1223 * both have the bug or neither. While there are a lot of combinations
1224 * that will work, there are also some that won't, so for simplicity
1225 * we'll just cope with the bug. That means we won't actually hash the
1226 * byte at mod_end, and we will expect that mod_end for the hash file
1227 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1228 * hash plus a newline for each module). We set bm_size to the true
1229 * correct number of bytes in each module, achieving exactly this.
1230 */
1231
1232 modules[midx].bm_addr = mod_start;
1233 modules[midx].bm_size = mod_end - mod_start;
1234 modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1235 modules[midx].bm_hash = NULL;
1236 modules[midx].bm_type = BMT_FILE;
1237
1238 if (cmdline == NULL) {
1239 modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1240 return;
1241 }
1242
1243 p = cmdline;
1244 modules[midx].bm_name =
1245 (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1246
1247 while (p != NULL) {
1248 q = strsep(&p, " \t\f\n\r");
1249 if (strncmp(q, "name=", 5) == 0) {
1250 if (q[5] != '\0' && !isspace(q[5])) {
1251 modules[midx].bm_name =
1252 (native_ptr_t)(uintptr_t)(q + 5);
1253 }
1254 continue;
1255 }
1256
1257 if (strncmp(q, "type=", 5) == 0) {
1258 if (q[5] == '\0' || isspace(q[5]))
1259 continue;
1260 q += 5;
1261 if (strcmp(q, "rootfs") == 0) {
1262 modules[midx].bm_type = BMT_ROOTFS;
1263 } else if (strcmp(q, "hash") == 0) {
1264 modules[midx].bm_type = BMT_HASH;
1265 } else if (strcmp(q, "file") != 0) {
1266 dboot_printf("\tmodule #%d: unknown module "
1267 "type '%s'; defaulting to 'file'",
1268 midx, q);
1269 }
1270 continue;
1271 }
1272
1273 if (strncmp(q, "hash=", 5) == 0) {
1274 if (q[5] != '\0' && !isspace(q[5])) {
1275 modules[midx].bm_hash =
1276 (native_ptr_t)(uintptr_t)(q + 5);
1277 }
1278 continue;
1279 }
1280
1281 dboot_printf("ignoring unknown option '%s'\n", q);
1282 }
1283 }
1284
1285 /*
1286 * Backward compatibility: if there are exactly one or two modules, both
1287 * of type 'file' and neither with an embedded hash value, we have been
1288 * given the legacy style modules. In this case we need to treat the first
1289 * module as a rootfs and the second as a hash referencing that module.
1290 * Otherwise, even if the configuration is invalid, we assume that the
1291 * operator knows what he's doing or at least isn't being bitten by this
1292 * interface change.
1293 */
1294 static void
fixup_modules(void)1295 fixup_modules(void)
1296 {
1297 if (modules_used == 0 || modules_used > 2)
1298 return;
1299
1300 if (modules[0].bm_type != BMT_FILE ||
1301 modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1302 return;
1303 }
1304
1305 if (modules[0].bm_hash != NULL ||
1306 modules_used > 1 && modules[1].bm_hash != NULL) {
1307 return;
1308 }
1309
1310 modules[0].bm_type = BMT_ROOTFS;
1311 if (modules_used > 1) {
1312 modules[1].bm_type = BMT_HASH;
1313 modules[1].bm_name = modules[0].bm_name;
1314 }
1315 }
1316
1317 /*
1318 * For modules that do not have assigned hashes but have a separate hash module,
1319 * find the assigned hash module and set the primary module's bm_hash to point
1320 * to the hash data from that module. We will then ignore modules of type
1321 * BMT_HASH from this point forward.
1322 */
1323 static void
assign_module_hashes(void)1324 assign_module_hashes(void)
1325 {
1326 uint_t i, j;
1327
1328 for (i = 0; i < modules_used; i++) {
1329 if (modules[i].bm_type == BMT_HASH ||
1330 modules[i].bm_hash != NULL) {
1331 continue;
1332 }
1333
1334 for (j = 0; j < modules_used; j++) {
1335 if (modules[j].bm_type != BMT_HASH ||
1336 strcmp((char *)(uintptr_t)modules[j].bm_name,
1337 (char *)(uintptr_t)modules[i].bm_name) != 0) {
1338 continue;
1339 }
1340
1341 if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1342 dboot_printf("Short hash module of length "
1343 "0x%lx bytes; ignoring\n",
1344 (ulong_t)modules[j].bm_size);
1345 } else {
1346 modules[i].bm_hash = modules[j].bm_addr;
1347 }
1348 break;
1349 }
1350 }
1351 }
1352
1353 /*
1354 * Walk through the module information finding the last used address.
1355 * The first available address will become the top level page table.
1356 */
1357 static void
dboot_process_modules(void)1358 dboot_process_modules(void)
1359 {
1360 int i, modcount;
1361 extern char _end[];
1362
1363 DBG_MSG("\nFinding Modules\n");
1364 modcount = dboot_multiboot_modcount();
1365 if (modcount > MAX_BOOT_MODULES) {
1366 dboot_panic("Too many modules (%d) -- the maximum is %d.",
1367 modcount, MAX_BOOT_MODULES);
1368 }
1369 /*
1370 * search the modules to find the last used address
1371 * we'll build the module list while we're walking through here
1372 */
1373 check_higher((paddr_t)(uintptr_t)&_end);
1374 for (i = 0; i < modcount; ++i) {
1375 process_module(i);
1376 modules_used++;
1377 }
1378 bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1379 DBG(bi->bi_modules);
1380 bi->bi_module_cnt = modcount;
1381 DBG(bi->bi_module_cnt);
1382
1383 fixup_modules();
1384 assign_module_hashes();
1385 check_images();
1386 }
1387
1388 /*
1389 * We then build the phys_install memlist from the multiboot information.
1390 */
1391 static void
dboot_process_mmap(void)1392 dboot_process_mmap(void)
1393 {
1394 uint64_t start;
1395 uint64_t end;
1396 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
1397 uint32_t lower, upper;
1398 int i, mmap_entries;
1399
1400 /*
1401 * Walk through the memory map from multiboot and build our memlist
1402 * structures. Note these will have native format pointers.
1403 */
1404 DBG_MSG("\nFinding Memory Map\n");
1405 num_entries = 0;
1406 num_entries_set = B_FALSE;
1407 max_mem = 0;
1408 if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1409 for (i = 0; i < mmap_entries; i++) {
1410 uint32_t type = dboot_loader_mmap_get_type(i);
1411 start = dboot_loader_mmap_get_base(i);
1412 end = start + dboot_loader_mmap_get_length(i);
1413
1414 if (prom_debug)
1415 dboot_printf("\ttype: %d %" PRIx64 "..%"
1416 PRIx64 "\n", type, start, end);
1417
1418 /*
1419 * page align start and end
1420 */
1421 start = (start + page_offset) & ~page_offset;
1422 end &= ~page_offset;
1423 if (end <= start)
1424 continue;
1425
1426 /*
1427 * only type 1 is usable RAM
1428 */
1429 switch (type) {
1430 case 1:
1431 if (end > max_mem)
1432 max_mem = end;
1433 memlists[memlists_used].addr = start;
1434 memlists[memlists_used].size = end - start;
1435 ++memlists_used;
1436 if (memlists_used > MAX_MEMLIST)
1437 dboot_panic("too many memlists");
1438 break;
1439 case 2:
1440 rsvdmemlists[rsvdmemlists_used].addr = start;
1441 rsvdmemlists[rsvdmemlists_used].size =
1442 end - start;
1443 ++rsvdmemlists_used;
1444 if (rsvdmemlists_used > MAX_MEMLIST)
1445 dboot_panic("too many rsvdmemlists");
1446 break;
1447 default:
1448 continue;
1449 }
1450 }
1451 build_pcimemlists();
1452 } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1453 DBG(lower);
1454 memlists[memlists_used].addr = 0;
1455 memlists[memlists_used].size = lower * 1024;
1456 ++memlists_used;
1457 DBG(upper);
1458 memlists[memlists_used].addr = 1024 * 1024;
1459 memlists[memlists_used].size = upper * 1024;
1460 ++memlists_used;
1461
1462 /*
1463 * Old platform - assume I/O space at the end of memory.
1464 */
1465 pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1466 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1467 pcimemlists[0].next = 0;
1468 pcimemlists[0].prev = 0;
1469 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1470 DBG(bi->bi_pcimem);
1471 } else {
1472 dboot_panic("No memory info from boot loader!!!");
1473 }
1474
1475 /*
1476 * finish processing the physinstall list
1477 */
1478 sort_physinstall();
1479
1480 /*
1481 * build bios reserved mem lists
1482 */
1483 build_rsvdmemlists();
1484 }
1485
1486 /*
1487 * The highest address is used as the starting point for dboot's simple
1488 * memory allocator.
1489 *
1490 * Finding the highest address in case of Multiboot 1 protocol is
1491 * quite painful in the sense that some information provided by
1492 * the multiboot info structure points to BIOS data, and some to RAM.
1493 *
1494 * The module list was processed and checked already by dboot_process_modules(),
1495 * so we will check the command line string and the memory map.
1496 *
1497 * This list of to be checked items is based on our current knowledge of
1498 * allocations made by grub1 and will need to be reviewed if there
1499 * are updates about the information provided by Multiboot 1.
1500 *
1501 * In the case of the Multiboot 2, our life is much simpler, as the MB2
1502 * information tag list is one contiguous chunk of memory.
1503 */
1504 static paddr_t
dboot_multiboot1_highest_addr(void)1505 dboot_multiboot1_highest_addr(void)
1506 {
1507 paddr_t addr = NULL;
1508 char *cmdl = (char *)mb_info->cmdline;
1509
1510 if (mb_info->flags & MB_INFO_CMDLINE)
1511 addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1512
1513 if (mb_info->flags & MB_INFO_MEM_MAP)
1514 addr = MAX(addr,
1515 ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1516 return (addr);
1517 }
1518
1519 static void
dboot_multiboot_highest_addr(void)1520 dboot_multiboot_highest_addr(void)
1521 {
1522 paddr_t addr;
1523
1524 switch (multiboot_version) {
1525 case 1:
1526 addr = dboot_multiboot1_highest_addr();
1527 if (addr != NULL)
1528 check_higher(addr);
1529 break;
1530 case 2:
1531 addr = dboot_multiboot2_highest_addr(mb2_info);
1532 if (addr != NULL)
1533 check_higher(addr);
1534 break;
1535 default:
1536 dboot_panic("Unknown multiboot version: %d\n",
1537 multiboot_version);
1538 break;
1539 }
1540 }
1541
/*
 * Walk the boot loader provided information and find the highest free
 * address.  The modules are handled first, then the memory map, and finally
 * the remaining boot loader data.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	/* boot modules: build the module list and verify hashes */
	dboot_process_modules();

	/* memory map: build the memlists */
	dboot_process_mmap();

	/* whatever else the boot loader left in memory */
	dboot_multiboot_highest_addr();
}
1553
1554 static void
dboot_multiboot_get_fwtables(void)1555 dboot_multiboot_get_fwtables(void)
1556 {
1557 multiboot_tag_new_acpi_t *nacpitagp;
1558 multiboot_tag_old_acpi_t *oacpitagp;
1559
1560 /* no fw tables from multiboot 1 */
1561 if (multiboot_version != 2)
1562 return;
1563
1564 nacpitagp = (multiboot_tag_new_acpi_t *)
1565 dboot_multiboot2_find_tag(mb2_info,
1566 MULTIBOOT_TAG_TYPE_ACPI_NEW);
1567 oacpitagp = (multiboot_tag_old_acpi_t *)
1568 dboot_multiboot2_find_tag(mb2_info,
1569 MULTIBOOT_TAG_TYPE_ACPI_OLD);
1570
1571 if (nacpitagp != NULL) {
1572 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1573 &nacpitagp->mb_rsdp[0];
1574 } else if (oacpitagp != NULL) {
1575 bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1576 &oacpitagp->mb_rsdp[0];
1577 } else {
1578 bi->bi_acpi_rsdp = NULL;
1579 }
1580 }
1581 #endif /* !__xpv */
1582
1583 /*
1584 * Simple memory allocator, allocates aligned physical memory.
1585 * Note that startup_kernel() only allocates memory, never frees.
1586 * Memory usage just grows in an upward direction.
1587 */
1588 static void *
do_mem_alloc(uint32_t size,uint32_t align)1589 do_mem_alloc(uint32_t size, uint32_t align)
1590 {
1591 uint_t i;
1592 uint64_t best;
1593 uint64_t start;
1594 uint64_t end;
1595
1596 /*
1597 * make sure size is a multiple of pagesize
1598 */
1599 size = RNDUP(size, MMU_PAGESIZE);
1600 next_avail_addr = RNDUP(next_avail_addr, align);
1601
1602 /*
1603 * XXPV fixme joe
1604 *
1605 * a really large bootarchive that causes you to run out of memory
1606 * may cause this to blow up
1607 */
1608 /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1609 best = (uint64_t)-size;
1610 for (i = 0; i < memlists_used; ++i) {
1611 start = memlists[i].addr;
1612 #if defined(__xpv)
1613 start += mfn_base;
1614 #endif
1615 end = start + memlists[i].size;
1616
1617 /*
1618 * did we find the desired address?
1619 */
1620 if (start <= next_avail_addr && next_avail_addr + size <= end) {
1621 best = next_avail_addr;
1622 goto done;
1623 }
1624
1625 /*
1626 * if not is this address the best so far?
1627 */
1628 if (start > next_avail_addr && start < best &&
1629 RNDUP(start, align) + size <= end)
1630 best = RNDUP(start, align);
1631 }
1632
1633 /*
1634 * We didn't find exactly the address we wanted, due to going off the
1635 * end of a memory region. Return the best found memory address.
1636 */
1637 done:
1638 next_avail_addr = best + size;
1639 #if defined(__xpv)
1640 if (next_avail_addr > scratch_end)
1641 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1642 "0x%lx", (ulong_t)next_avail_addr,
1643 (ulong_t)scratch_end);
1644 #endif
1645 (void) memset((void *)(uintptr_t)best, 0, size);
1646 return ((void *)(uintptr_t)best);
1647 }
1648
1649 void *
mem_alloc(uint32_t size)1650 mem_alloc(uint32_t size)
1651 {
1652 return (do_mem_alloc(size, MMU_PAGESIZE));
1653 }
1654
1655
1656 /*
1657 * Build page tables to map all of memory used so far as well as the kernel.
1658 */
1659 static void
build_page_tables(void)1660 build_page_tables(void)
1661 {
1662 uint32_t psize;
1663 uint32_t level;
1664 uint32_t off;
1665 uint64_t start;
1666 #if !defined(__xpv)
1667 uint32_t i;
1668 uint64_t end;
1669 #endif /* __xpv */
1670
1671 /*
1672 * If we're on metal, we need to create the top level pagetable.
1673 */
1674 #if defined(__xpv)
1675 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1676 #else /* __xpv */
1677 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1678 #endif /* __xpv */
1679 DBG((uintptr_t)top_page_table);
1680
1681 /*
1682 * Determine if we'll use large mappings for kernel, then map it.
1683 */
1684 if (largepage_support) {
1685 psize = lpagesize;
1686 level = 1;
1687 } else {
1688 psize = MMU_PAGESIZE;
1689 level = 0;
1690 }
1691
1692 DBG_MSG("Mapping kernel\n");
1693 DBG(ktext_phys);
1694 DBG(target_kernel_text);
1695 DBG(ksize);
1696 DBG(psize);
1697 for (off = 0; off < ksize; off += psize)
1698 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1699
1700 /*
1701 * The kernel will need a 1 page window to work with page tables
1702 */
1703 bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1704 DBG(bi->bi_pt_window);
1705 bi->bi_pte_to_pt_window =
1706 (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1707 DBG(bi->bi_pte_to_pt_window);
1708
1709 #if defined(__xpv)
1710 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1711 /* If this is a domU we're done. */
1712 DBG_MSG("\nPage tables constructed\n");
1713 return;
1714 }
1715 #endif /* __xpv */
1716
1717 /*
1718 * We need 1:1 mappings for the lower 1M of memory to access
1719 * BIOS tables used by a couple of drivers during boot.
1720 *
1721 * The following code works because our simple memory allocator
1722 * only grows usage in an upwards direction.
1723 *
1724 * Note that by this point in boot some mappings for low memory
1725 * may already exist because we've already accessed device in low
1726 * memory. (Specifically the video frame buffer and keyboard
1727 * status ports.) If we're booting on raw hardware then GRUB
1728 * created these mappings for us. If we're booting under a
1729 * hypervisor then we went ahead and remapped these devices into
1730 * memory allocated within dboot itself.
1731 */
1732 if (map_debug)
1733 dboot_printf("1:1 map pa=0..1Meg\n");
1734 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1735 #if defined(__xpv)
1736 map_ma_at_va(start, start, 0);
1737 #else /* __xpv */
1738 map_pa_at_va(start, start, 0);
1739 #endif /* __xpv */
1740 }
1741
1742 #if !defined(__xpv)
1743 for (i = 0; i < memlists_used; ++i) {
1744 start = memlists[i].addr;
1745
1746 end = start + memlists[i].size;
1747
1748 if (map_debug)
1749 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1750 start, end);
1751 while (start < end && start < next_avail_addr) {
1752 map_pa_at_va(start, start, 0);
1753 start += MMU_PAGESIZE;
1754 }
1755 }
1756 #endif /* !__xpv */
1757
1758 DBG_MSG("\nPage tables constructed\n");
1759 }
1760
/*
 * Panic text used when the kernel command line still asks for the old
 * "multiboot" program.
 */
#define	NO_MULTIBOOT	\
	"multiboot is no longer used to boot the Solaris Operating System.\n" \
	"The grub entry should be changed to:\n" \
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
	"module$ /platform/i86pc/$ISADIR/boot_archive\n" \
	"See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1767
1768 static void
dboot_init_xboot_consinfo(void)1769 dboot_init_xboot_consinfo(void)
1770 {
1771 uintptr_t addr;
1772 /*
1773 * boot info must be 16 byte aligned for 64 bit kernel ABI
1774 */
1775 addr = (uintptr_t)boot_info;
1776 addr = (addr + 0xf) & ~0xf;
1777 bi = (struct xboot_info *)addr;
1778
1779 #if !defined(__xpv)
1780 switch (multiboot_version) {
1781 case 1:
1782 dboot_multiboot1_xboot_consinfo();
1783 break;
1784 case 2:
1785 dboot_multiboot2_xboot_consinfo();
1786 break;
1787 default:
1788 dboot_panic("Unknown multiboot version: %d\n",
1789 multiboot_version);
1790 break;
1791 }
1792 #endif
1793 }
1794
1795 /*
1796 * Set up basic data from the boot loader.
1797 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
1798 * 32-bit dboot code setup used to set up and start 64-bit kernel.
1799 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
1800 * start 64-bit illumos kernel.
1801 */
1802 static void
dboot_loader_init(void)1803 dboot_loader_init(void)
1804 {
1805 #if !defined(__xpv)
1806 mb_info = NULL;
1807 mb2_info = NULL;
1808
1809 switch (mb_magic) {
1810 case MB_BOOTLOADER_MAGIC:
1811 multiboot_version = 1;
1812 mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
1813 #if defined(_BOOT_TARGET_amd64)
1814 load_addr = mb_header.load_addr;
1815 #endif
1816 break;
1817
1818 case MULTIBOOT2_BOOTLOADER_MAGIC:
1819 multiboot_version = 2;
1820 mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
1821 mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
1822 #if defined(_BOOT_TARGET_amd64)
1823 load_addr = mb2_load_addr;
1824 #endif
1825 break;
1826
1827 default:
1828 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
1829 break;
1830 }
1831 #endif /* !defined(__xpv) */
1832 }
1833
1834 /* Extract the kernel command line from [multi]boot information. */
1835 static char *
dboot_loader_cmdline(void)1836 dboot_loader_cmdline(void)
1837 {
1838 char *line = NULL;
1839
1840 #if defined(__xpv)
1841 line = (char *)xen_info->cmd_line;
1842 #else /* __xpv */
1843
1844 switch (multiboot_version) {
1845 case 1:
1846 if (mb_info->flags & MB_INFO_CMDLINE)
1847 line = (char *)mb_info->cmdline;
1848 break;
1849
1850 case 2:
1851 line = dboot_multiboot2_cmdline(mb2_info);
1852 break;
1853
1854 default:
1855 dboot_panic("Unknown multiboot version: %d\n",
1856 multiboot_version);
1857 break;
1858 }
1859
1860 #endif /* __xpv */
1861
1862 /*
1863 * Make sure we have valid pointer so the string operations
1864 * will not crash us.
1865 */
1866 if (line == NULL)
1867 line = "";
1868
1869 return (line);
1870 }
1871
1872 static char *
dboot_loader_name(void)1873 dboot_loader_name(void)
1874 {
1875 #if defined(__xpv)
1876 return (NULL);
1877 #else /* __xpv */
1878 multiboot_tag_string_t *tag;
1879
1880 switch (multiboot_version) {
1881 case 1:
1882 return ((char *)mb_info->boot_loader_name);
1883
1884 case 2:
1885 tag = dboot_multiboot2_find_tag(mb2_info,
1886 MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
1887 return (tag->mb_string);
1888 default:
1889 dboot_panic("Unknown multiboot version: %d\n",
1890 multiboot_version);
1891 break;
1892 }
1893
1894 return (NULL);
1895 #endif /* __xpv */
1896 }
1897 /*
1898 * startup_kernel has a pretty simple job. It builds pagetables which reflect
1899 * 1:1 mappings for all memory in use. It then also adds mappings for
1900 * the kernel nucleus at virtual address of target_kernel_text using large page
1901 * mappings. The page table pages are also accessible at 1:1 mapped
1902 * virtual addresses.
1903 */
1904 /*ARGSUSED*/
1905 void
startup_kernel(void)1906 startup_kernel(void)
1907 {
1908 char *cmdline;
1909 char *bootloader;
1910 #if defined(__xpv)
1911 physdev_set_iopl_t set_iopl;
1912 #endif /* __xpv */
1913
1914 dboot_loader_init();
1915 /*
1916 * At this point we are executing in a 32 bit real mode.
1917 */
1918
1919 bootloader = dboot_loader_name();
1920 cmdline = dboot_loader_cmdline();
1921
1922 prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1923 map_debug = (strstr(cmdline, "map_debug") != NULL);
1924
1925 #if defined(__xpv)
1926 /*
1927 * For dom0, before we initialize the console subsystem we'll
1928 * need to enable io operations, so set I/O priveldge level to 1.
1929 */
1930 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1931 set_iopl.iopl = 1;
1932 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1933 }
1934 #endif /* __xpv */
1935
1936 dboot_init_xboot_consinfo();
1937 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1938
1939 #if !defined(__xpv)
1940 dboot_multiboot_get_fwtables();
1941 #endif
1942 bcons_init(cmdline);
1943 DBG_MSG("\n\nillumos prekernel set: ");
1944 DBG_MSG(cmdline);
1945 DBG_MSG("\n");
1946
1947 if (bootloader != NULL && prom_debug) {
1948 dboot_printf("Kernel loaded by: %s\n", bootloader);
1949 #if !defined(__xpv)
1950 dboot_printf("Using multiboot %d boot protocol.\n",
1951 multiboot_version);
1952 #endif
1953 }
1954
1955 if (strstr(cmdline, "multiboot") != NULL) {
1956 dboot_panic(NO_MULTIBOOT);
1957 }
1958
1959 DBG((uintptr_t)bi);
1960 #if !defined(__xpv)
1961 DBG((uintptr_t)mb_info);
1962 DBG((uintptr_t)mb2_info);
1963 if (mb2_info != NULL)
1964 DBG(mb2_info->mbi_total_size);
1965 DBG(bi->bi_acpi_rsdp);
1966 #endif
1967
1968 /*
1969 * Need correct target_kernel_text value
1970 */
1971 #if defined(_BOOT_TARGET_amd64)
1972 target_kernel_text = KERNEL_TEXT_amd64;
1973 #elif defined(__xpv)
1974 target_kernel_text = KERNEL_TEXT_i386_xpv;
1975 #else
1976 target_kernel_text = KERNEL_TEXT_i386;
1977 #endif
1978 DBG(target_kernel_text);
1979
1980 #if defined(__xpv)
1981
1982 /*
1983 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
1984 */
1985
1986 #if defined(_BOOT_TARGET_amd64)
1987 /*
1988 * 64-bit hypervisor.
1989 */
1990 amd64_support = 1;
1991 pae_support = 1;
1992
1993 #else /* _BOOT_TARGET_amd64 */
1994
1995 /*
1996 * See if we are running on a PAE Hypervisor
1997 */
1998 {
1999 xen_capabilities_info_t caps;
2000
2001 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2002 dboot_panic("HYPERVISOR_xen_version(caps) failed");
2003 caps[sizeof (caps) - 1] = 0;
2004 if (prom_debug)
2005 dboot_printf("xen capabilities %s\n", caps);
2006 if (strstr(caps, "x86_32p") != NULL)
2007 pae_support = 1;
2008 }
2009
2010 #endif /* _BOOT_TARGET_amd64 */
2011 {
2012 xen_platform_parameters_t p;
2013
2014 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2015 dboot_panic("HYPERVISOR_xen_version(parms) failed");
2016 DBG(p.virt_start);
2017 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2018 }
2019
2020 /*
2021 * The hypervisor loads stuff starting at 1Gig
2022 */
2023 mfn_base = ONE_GIG;
2024 DBG(mfn_base);
2025
2026 /*
2027 * enable writable page table mode for the hypervisor
2028 */
2029 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2030 VMASST_TYPE_writable_pagetables) < 0)
2031 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2032
2033 /*
2034 * check for NX support
2035 */
2036 if (pae_support) {
2037 uint32_t eax = 0x80000000;
2038 uint32_t edx = get_cpuid_edx(&eax);
2039
2040 if (eax >= 0x80000001) {
2041 eax = 0x80000001;
2042 edx = get_cpuid_edx(&eax);
2043 if (edx & CPUID_AMD_EDX_NX)
2044 NX_support = 1;
2045 }
2046 }
2047
2048 #if !defined(_BOOT_TARGET_amd64)
2049
2050 /*
2051 * The 32-bit hypervisor uses segmentation to protect itself from
2052 * guests. This means when a guest attempts to install a flat 4GB
2053 * code or data descriptor the 32-bit hypervisor will protect itself
2054 * by silently shrinking the segment such that if the guest attempts
2055 * any access where the hypervisor lives a #gp fault is generated.
2056 * The problem is that some applications expect a full 4GB flat
2057 * segment for their current thread pointer and will use negative
2058 * offset segment wrap around to access data. TLS support in linux
2059 * brand is one example of this.
2060 *
2061 * The 32-bit hypervisor can catch the #gp fault in these cases
2062 * and emulate the access without passing the #gp fault to the guest
2063 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2064 * Seems like this should have been the default.
2065 * Either way, we want the hypervisor -- and not Solaris -- to deal
2066 * to deal with emulating these accesses.
2067 */
2068 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2069 VMASST_TYPE_4gb_segments) < 0)
2070 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2071 #endif /* !_BOOT_TARGET_amd64 */
2072
2073 #else /* __xpv */
2074
2075 /*
2076 * use cpuid to enable MMU features
2077 */
2078 if (have_cpuid()) {
2079 uint32_t eax, edx;
2080
2081 eax = 1;
2082 edx = get_cpuid_edx(&eax);
2083 if (edx & CPUID_INTC_EDX_PSE)
2084 largepage_support = 1;
2085 if (edx & CPUID_INTC_EDX_PGE)
2086 pge_support = 1;
2087 if (edx & CPUID_INTC_EDX_PAE)
2088 pae_support = 1;
2089
2090 eax = 0x80000000;
2091 edx = get_cpuid_edx(&eax);
2092 if (eax >= 0x80000001) {
2093 eax = 0x80000001;
2094 edx = get_cpuid_edx(&eax);
2095 if (edx & CPUID_AMD_EDX_LM)
2096 amd64_support = 1;
2097 if (edx & CPUID_AMD_EDX_NX)
2098 NX_support = 1;
2099 }
2100 } else {
2101 dboot_printf("cpuid not supported\n");
2102 }
2103 #endif /* __xpv */
2104
2105
2106 #if defined(_BOOT_TARGET_amd64)
2107 if (amd64_support == 0)
2108 dboot_panic("long mode not supported, rebooting");
2109 else if (pae_support == 0)
2110 dboot_panic("long mode, but no PAE; rebooting");
2111 #else
2112 /*
2113 * Allow the command line to over-ride use of PAE for 32 bit.
2114 */
2115 if (strstr(cmdline, "disablePAE=true") != NULL) {
2116 pae_support = 0;
2117 NX_support = 0;
2118 amd64_support = 0;
2119 }
2120 #endif
2121
2122 /*
2123 * initialize the simple memory allocator
2124 */
2125 init_mem_alloc();
2126
2127 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2128 /*
2129 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2130 */
2131 if (max_mem < FOUR_GIG && NX_support == 0)
2132 pae_support = 0;
2133 #endif
2134
2135 /*
2136 * configure mmu information
2137 */
2138 if (pae_support) {
2139 shift_amt = shift_amt_pae;
2140 ptes_per_table = 512;
2141 pte_size = 8;
2142 lpagesize = TWO_MEG;
2143 #if defined(_BOOT_TARGET_amd64)
2144 top_level = 3;
2145 #else
2146 top_level = 2;
2147 #endif
2148 } else {
2149 pae_support = 0;
2150 NX_support = 0;
2151 shift_amt = shift_amt_nopae;
2152 ptes_per_table = 1024;
2153 pte_size = 4;
2154 lpagesize = FOUR_MEG;
2155 top_level = 1;
2156 }
2157
2158 DBG(pge_support);
2159 DBG(NX_support);
2160 DBG(largepage_support);
2161 DBG(amd64_support);
2162 DBG(top_level);
2163 DBG(pte_size);
2164 DBG(ptes_per_table);
2165 DBG(lpagesize);
2166
2167 #if defined(__xpv)
2168 ktext_phys = ONE_GIG; /* from UNIX Mapfile */
2169 #else
2170 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */
2171 #endif
2172
2173 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2174 /*
2175 * For grub, copy kernel bits from the ELF64 file to final place.
2176 */
2177 DBG_MSG("\nAllocating nucleus pages.\n");
2178 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2179 if (ktext_phys == 0)
2180 dboot_panic("failed to allocate aligned kernel memory");
2181 DBG(load_addr);
2182 if (dboot_elfload64(load_addr) != 0)
2183 dboot_panic("failed to parse kernel ELF image, rebooting");
2184 #endif
2185
2186 DBG(ktext_phys);
2187
2188 /*
2189 * Allocate page tables.
2190 */
2191 build_page_tables();
2192
2193 /*
2194 * return to assembly code to switch to running kernel
2195 */
2196 entry_addr_low = (uint32_t)target_kernel_text;
2197 DBG(entry_addr_low);
2198 bi->bi_use_largepage = largepage_support;
2199 bi->bi_use_pae = pae_support;
2200 bi->bi_use_pge = pge_support;
2201 bi->bi_use_nx = NX_support;
2202
2203 #if defined(__xpv)
2204
2205 bi->bi_next_paddr = next_avail_addr - mfn_base;
2206 DBG(bi->bi_next_paddr);
2207 bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
2208 DBG(bi->bi_next_vaddr);
2209
2210 /*
2211 * unmap unused pages in start area to make them available for DMA
2212 */
2213 while (next_avail_addr < scratch_end) {
2214 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
2215 0, UVMF_INVLPG | UVMF_LOCAL);
2216 next_avail_addr += MMU_PAGESIZE;
2217 }
2218
2219 bi->bi_xen_start_info = (uintptr_t)xen_info;
2220 DBG((uintptr_t)HYPERVISOR_shared_info);
2221 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2222 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2223
2224 #else /* __xpv */
2225
2226 bi->bi_next_paddr = next_avail_addr;
2227 DBG(bi->bi_next_paddr);
2228 bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
2229 DBG(bi->bi_next_vaddr);
2230 bi->bi_mb_version = multiboot_version;
2231
2232 switch (multiboot_version) {
2233 case 1:
2234 bi->bi_mb_info = (uintptr_t)mb_info;
2235 break;
2236 case 2:
2237 bi->bi_mb_info = (uintptr_t)mb2_info;
2238 break;
2239 default:
2240 dboot_panic("Unknown multiboot version: %d\n",
2241 multiboot_version);
2242 break;
2243 }
2244 bi->bi_top_page_table = (uintptr_t)top_page_table;
2245
2246 #endif /* __xpv */
2247
2248 bi->bi_kseg_size = FOUR_MEG;
2249 DBG(bi->bi_kseg_size);
2250
2251 #ifndef __xpv
2252 if (map_debug)
2253 dump_tables();
2254 #endif
2255
2256 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2257 }
2258