1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright 2013 Joyent, Inc. All rights reserved.
27 */
28
29
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/sha1.h>
37 #include <util/string.h>
38 #include <util/strtolctype.h>
39
40 #if defined(__xpv)
41
42 #include <sys/hypervisor.h>
43 uintptr_t xen_virt_start;
44 pfn_t *mfn_to_pfn_mapping;
45
46 #else /* !__xpv */
47
48 extern multiboot_header_t mb_header;
49 extern int have_cpuid(void);
50
51 #endif /* !__xpv */
52
53 #include <sys/inttypes.h>
54 #include <sys/bootinfo.h>
55 #include <sys/mach_mmu.h>
56 #include <sys/boot_console.h>
57
58 #include "dboot_asm.h"
59 #include "dboot_printf.h"
60 #include "dboot_xboot.h"
61 #include "dboot_elfload.h"
62
63 #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
64
65 /*
66 * This file contains code that runs to transition us from either a multiboot
67 * compliant loader (32 bit non-paging) or a XPV domain loader to
68 * regular kernel execution. Its task is to setup the kernel memory image
69 * and page tables.
70 *
71 * The code executes as:
72 * - 32 bits under GRUB (for 32 or 64 bit Solaris)
73 * - a 32 bit program for the 32-bit PV hypervisor
74 * - a 64 bit program for the 64-bit PV hypervisor (at least for now)
75 *
76 * Under the PV hypervisor, we must create mappings for any memory beyond the
77 * initial start of day allocation (such as the kernel itself).
78 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
81 */
82
/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * These are the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;	/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each table has MAX_MEMLIST slots and a companion *_used count of the
 * slots currently filled in.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader. It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly. This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags: non-zero enables the extra dboot_printf() output that
 * is guarded by "if (prom_debug)" / "if (map_debug)" in this file.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

/* Placeholder module name used by process_module() when mod_name is NULL. */
static char noname[2] = "-";
182
183 /*
184 * Either hypervisor-specific or grub-specific code builds the initial
185 * memlists. This code does the sort/merge/link for final use.
186 */
187 static void
sort_physinstall(void)188 sort_physinstall(void)
189 {
190 int i;
191 #if !defined(__xpv)
192 int j;
193 struct boot_memlist tmp;
194
195 /*
196 * Now sort the memlists, in case they weren't in order.
197 * Yeah, this is a bubble sort; small, simple and easy to get right.
198 */
199 DBG_MSG("Sorting phys-installed list\n");
200 for (j = memlists_used - 1; j > 0; --j) {
201 for (i = 0; i < j; ++i) {
202 if (memlists[i].addr < memlists[i + 1].addr)
203 continue;
204 tmp = memlists[i];
205 memlists[i] = memlists[i + 1];
206 memlists[i + 1] = tmp;
207 }
208 }
209
210 /*
211 * Merge any memlists that don't have holes between them.
212 */
213 for (i = 0; i <= memlists_used - 1; ++i) {
214 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
215 continue;
216
217 if (prom_debug)
218 dboot_printf(
219 "merging mem segs %" PRIx64 "...%" PRIx64
220 " w/ %" PRIx64 "...%" PRIx64 "\n",
221 memlists[i].addr,
222 memlists[i].addr + memlists[i].size,
223 memlists[i + 1].addr,
224 memlists[i + 1].addr + memlists[i + 1].size);
225
226 memlists[i].size += memlists[i + 1].size;
227 for (j = i + 1; j < memlists_used - 1; ++j)
228 memlists[j] = memlists[j + 1];
229 --memlists_used;
230 DBG(memlists_used);
231 --i; /* after merging we need to reexamine, so do this */
232 }
233 #endif /* __xpv */
234
235 if (prom_debug) {
236 dboot_printf("\nFinal memlists:\n");
237 for (i = 0; i < memlists_used; ++i) {
238 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
239 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
240 }
241 }
242
243 /*
244 * link together the memlists with native size pointers
245 */
246 memlists[0].next = 0;
247 memlists[0].prev = 0;
248 for (i = 1; i < memlists_used; ++i) {
249 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
250 memlists[i].next = 0;
251 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
252 }
253 bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
254 DBG(bi->bi_phys_install);
255 }
256
257 /*
258 * build bios reserved memlists
259 */
260 static void
build_rsvdmemlists(void)261 build_rsvdmemlists(void)
262 {
263 int i;
264
265 rsvdmemlists[0].next = 0;
266 rsvdmemlists[0].prev = 0;
267 for (i = 1; i < rsvdmemlists_used; ++i) {
268 rsvdmemlists[i].prev =
269 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
270 rsvdmemlists[i].next = 0;
271 rsvdmemlists[i - 1].next =
272 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
273 }
274 bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
275 DBG(bi->bi_rsvdmem);
276 }
277
278 #if defined(__xpv)
279
280 /*
281 * halt on the hypervisor after a delay to drain console output
282 */
283 void
dboot_halt(void)284 dboot_halt(void)
285 {
286 uint_t i = 10000;
287
288 while (--i)
289 (void) HYPERVISOR_yield();
290 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
291 }
292
293 /*
294 * From a machine address, find the corresponding pseudo-physical address.
295 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
296 * Machine addresses are the real underlying hardware addresses.
297 * These are needed for page table entries. Note that this routine is
298 * poorly protected. A bad value of "ma" will cause a page fault.
299 */
300 paddr_t
ma_to_pa(maddr_t ma)301 ma_to_pa(maddr_t ma)
302 {
303 ulong_t pgoff = ma & MMU_PAGEOFFSET;
304 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
305 paddr_t pa;
306
307 if (pfn >= xen_info->nr_pages)
308 return (-(paddr_t)1);
309 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
310 #ifdef DEBUG
311 if (ma != pa_to_ma(pa))
312 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
313 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
314 #endif
315 return (pa);
316 }
317
318 /*
319 * From a pseudo-physical address, find the corresponding machine address.
320 */
321 maddr_t
pa_to_ma(paddr_t pa)322 pa_to_ma(paddr_t pa)
323 {
324 pfn_t pfn;
325 ulong_t mfn;
326
327 pfn = mmu_btop(pa - mfn_base);
328 if (pa < mfn_base || pfn >= xen_info->nr_pages)
329 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
330 mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
331 #ifdef DEBUG
332 if (mfn_to_pfn_mapping[mfn] != pfn)
333 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
334 pfn, mfn, mfn_to_pfn_mapping[mfn]);
335 #endif
336 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
337 }
338
339 #endif /* __xpv */
340
341 x86pte_t
get_pteval(paddr_t table,uint_t index)342 get_pteval(paddr_t table, uint_t index)
343 {
344 if (pae_support)
345 return (((x86pte_t *)(uintptr_t)table)[index]);
346 return (((x86pte32_t *)(uintptr_t)table)[index]);
347 }
348
349 /*ARGSUSED*/
350 void
set_pteval(paddr_t table,uint_t index,uint_t level,x86pte_t pteval)351 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
352 {
353 #ifdef __xpv
354 mmu_update_t t;
355 maddr_t mtable = pa_to_ma(table);
356 int retcnt;
357
358 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
359 t.val = pteval;
360 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
361 dboot_panic("HYPERVISOR_mmu_update() failed");
362 #else /* __xpv */
363 uintptr_t tab_addr = (uintptr_t)table;
364
365 if (pae_support)
366 ((x86pte_t *)tab_addr)[index] = pteval;
367 else
368 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
369 if (level == top_level && level == 2)
370 reload_cr3();
371 #endif /* __xpv */
372 }
373
374 paddr_t
make_ptable(x86pte_t * pteval,uint_t level)375 make_ptable(x86pte_t *pteval, uint_t level)
376 {
377 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
378
379 if (level == top_level && level == 2)
380 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
381 else
382 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
383
384 #ifdef __xpv
385 /* Remove write permission to the new page table. */
386 if (HYPERVISOR_update_va_mapping(new_table,
387 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
388 dboot_panic("HYP_update_va_mapping error");
389 #endif
390
391 if (map_debug)
392 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
393 PRIx64 "\n", level, (ulong_t)new_table, *pteval);
394 return (new_table);
395 }
396
397 x86pte_t *
map_pte(paddr_t table,uint_t index)398 map_pte(paddr_t table, uint_t index)
399 {
400 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
401 }
402
/*
 * dump out the contents of page tables...
 *
 * Walks the tables starting at top_page_table and prints every non-zero
 * entry.  The walk is depth first but written without real recursion: when
 * an entry refers to a lower level table, the current table and index are
 * saved in save_table[]/save_index[] (indexed by level) and the loop is
 * re-entered for the lower table; when a table is exhausted the saved state
 * of the level above is restored.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	/*
	 * "tabs + l" is used as the line prefix: it is the empty string at
	 * the top level and gains one tab for each level below it.
	 */
	char *tabs = tablist + 3 - top_level;
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		/* entry width depends on whether PAE format tables are used */
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 *
		 * Descend when the entry is above level 1, or is a level 1
		 * entry without PT_PAGESIZE.  index is set to -1 so that the
		 * loop's ++index restarts the lower table at entry 0.
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 *
		 * Count how many following entries map physically
		 * consecutive pages; a long run is collapsed to "...".
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/*
	 * Finished one table: if it was not the top level one, restore the
	 * parent's table and index and jump back to the end of the loop body
	 * so that the parent continues with its next entry.
	 */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
489
490 /*
491 * Add a mapping for the machine page at the given virtual address.
492 */
493 static void
map_ma_at_va(maddr_t ma,native_ptr_t va,uint_t level)494 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
495 {
496 x86pte_t *ptep;
497 x86pte_t pteval;
498
499 pteval = ma | pte_bits;
500 if (level > 0)
501 pteval |= PT_PAGESIZE;
502 if (va >= target_kernel_text && pge_support)
503 pteval |= PT_GLOBAL;
504
505 if (map_debug && ma != va)
506 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
507 " pte=0x%" PRIx64 " l=%d\n",
508 (uint64_t)ma, (uint64_t)va, pteval, level);
509
510 #if defined(__xpv)
511 /*
512 * see if we can avoid find_pte() on the hypervisor
513 */
514 if (HYPERVISOR_update_va_mapping(va, pteval,
515 UVMF_INVLPG | UVMF_LOCAL) == 0)
516 return;
517 #endif
518
519 /*
520 * Find the pte that will map this address. This creates any
521 * missing intermediate level page tables
522 */
523 ptep = find_pte(va, NULL, level, 0);
524
525 /*
526 * When paravirtualized, we must use hypervisor calls to modify the
527 * PTE, since paging is active. On real hardware we just write to
528 * the pagetables which aren't in use yet.
529 */
530 #if defined(__xpv)
531 ptep = ptep; /* shut lint up */
532 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
533 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
534 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
535 (uint64_t)va, level, (uint64_t)ma, pteval);
536 #else
537 if (va < 1024 * 1024)
538 pteval |= PT_NOCACHE; /* for video RAM */
539 if (pae_support)
540 *ptep = pteval;
541 else
542 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
543 #endif
544 }
545
546 /*
547 * Add a mapping for the physical page at the given virtual address.
548 */
549 static void
map_pa_at_va(paddr_t pa,native_ptr_t va,uint_t level)550 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
551 {
552 map_ma_at_va(pa_to_ma(pa), va, level);
553 }
554
555 /*
556 * This is called to remove start..end from the
557 * possible range of PCI addresses.
558 */
559 const uint64_t pci_lo_limit = 0x00100000ul;
560 const uint64_t pci_hi_limit = 0xfff00000ul;
561 static void
exclude_from_pci(uint64_t start,uint64_t end)562 exclude_from_pci(uint64_t start, uint64_t end)
563 {
564 int i;
565 int j;
566 struct boot_memlist *ml;
567
568 for (i = 0; i < pcimemlists_used; ++i) {
569 ml = &pcimemlists[i];
570
571 /* delete the entire range? */
572 if (start <= ml->addr && ml->addr + ml->size <= end) {
573 --pcimemlists_used;
574 for (j = i; j < pcimemlists_used; ++j)
575 pcimemlists[j] = pcimemlists[j + 1];
576 --i; /* to revisit the new one at this index */
577 }
578
579 /* split a range? */
580 else if (ml->addr < start && end < ml->addr + ml->size) {
581
582 ++pcimemlists_used;
583 if (pcimemlists_used > MAX_MEMLIST)
584 dboot_panic("too many pcimemlists");
585
586 for (j = pcimemlists_used - 1; j > i; --j)
587 pcimemlists[j] = pcimemlists[j - 1];
588 ml->size = start - ml->addr;
589
590 ++ml;
591 ml->size = (ml->addr + ml->size) - end;
592 ml->addr = end;
593 ++i; /* skip on to next one */
594 }
595
596 /* cut memory off the start? */
597 else if (ml->addr < end && end < ml->addr + ml->size) {
598 ml->size -= end - ml->addr;
599 ml->addr = end;
600 }
601
602 /* cut memory off the end? */
603 else if (ml->addr <= start && start < ml->addr + ml->size) {
604 ml->size = start - ml->addr;
605 }
606 }
607 }
608
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 *
 * mmap_t is therefore a local structure without that field under the
 * hypervisor and simply an alias for mb_memory_map_t on the metal, so that
 * build_pcimemlists() can be shared by both builds.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;	/* low 32 bits of the range start */
	uint32_t	base_addr_high;	/* high 32 bits of the range start */
	uint32_t	length_low;	/* low 32 bits of the range length */
	uint32_t	length_high;	/* high 32 bits of the range length */
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
624
625 static void
build_pcimemlists(mmap_t * mem,int num)626 build_pcimemlists(mmap_t *mem, int num)
627 {
628 mmap_t *mmap;
629 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
630 uint64_t start;
631 uint64_t end;
632 int i;
633
634 /*
635 * initialize
636 */
637 pcimemlists[0].addr = pci_lo_limit;
638 pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
639 pcimemlists_used = 1;
640
641 /*
642 * Fill in PCI memlists.
643 */
644 for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
645 start = ((uint64_t)mmap->base_addr_high << 32) +
646 mmap->base_addr_low;
647 end = start + ((uint64_t)mmap->length_high << 32) +
648 mmap->length_low;
649
650 if (prom_debug)
651 dboot_printf("\ttype: %d %" PRIx64 "..%"
652 PRIx64 "\n", mmap->type, start, end);
653
654 /*
655 * page align start and end
656 */
657 start = (start + page_offset) & ~page_offset;
658 end &= ~page_offset;
659 if (end <= start)
660 continue;
661
662 exclude_from_pci(start, end);
663 }
664
665 /*
666 * Finish off the pcimemlist
667 */
668 if (prom_debug) {
669 for (i = 0; i < pcimemlists_used; ++i) {
670 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
671 PRIx64 "\n", pcimemlists[i].addr,
672 pcimemlists[i].addr + pcimemlists[i].size);
673 }
674 }
675 pcimemlists[0].next = 0;
676 pcimemlists[0].prev = 0;
677 for (i = 1; i < pcimemlists_used; ++i) {
678 pcimemlists[i].prev =
679 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
680 pcimemlists[i].next = 0;
681 pcimemlists[i - 1].next =
682 (native_ptr_t)(uintptr_t)(pcimemlists + i);
683 }
684 bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
685 DBG(bi->bi_pcimem);
686 }
687
688 #if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 *
 * Besides setting up scratch_end and next_avail_addr, this records the
 * (at most one) boot module, builds the single-entry phys-install list and
 * the reserved list, and, for the initial domain only, builds the PCI
 * memory list from the machine memory map.
 */
#define	MAXMAPS	100	/* capacity of map_buffer, in entries */
static mmap_t map_buffer[MAXMAPS];	/* receives the machine memory map */
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressible range.  We'll
	 * switch to new page tables before we unpack the kernel
	 *
	 * The address of the stack variable "local" is what locates the
	 * stack, and hence the start of the scratch area.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	/* memory size follows directly from the domain's page count */
	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 *
		 * map.nr_entries goes in as the capacity of map_buffer and
		 * is used afterwards as the number of entries to process.
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}
780
781 #else /* !__xpv */
782
/*
 * Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f' or 'A'-'F') to its
 * numeric value, 0 through 15.
 *
 * Every other character is rejected with a panic.  Each range is checked
 * at both ends so that characters such as 'g', 'Z' or ':' can no longer
 * slip through and produce a value larger than 0xf.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	/* not reached if dboot_panic() does not return; keeps lint quiet */
	return (0);
}
797
798 static void
digest_a2h(const char * ascii,uint8_t * digest)799 digest_a2h(const char *ascii, uint8_t *digest)
800 {
801 unsigned int i;
802
803 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
804 digest[i] = dboot_a2h(ascii[i * 2]) << 4;
805 digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
806 }
807 }
808
809 /*
810 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
811 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
812 * match, return 0, otherwise -1. This works only for images smaller than
813 * 4 GB, which should not be a problem.
814 */
815 static int
check_image_hash(uint_t midx)816 check_image_hash(uint_t midx)
817 {
818 const char *ascii;
819 const void *image;
820 size_t len;
821 SHA1_CTX ctx;
822 uint8_t digest[SHA1_DIGEST_LENGTH];
823 uint8_t baseline[SHA1_DIGEST_LENGTH];
824 unsigned int i;
825
826 ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
827 image = (const void *)(uintptr_t)modules[midx].bm_addr;
828 len = (size_t)modules[midx].bm_size;
829
830 digest_a2h(ascii, baseline);
831
832 SHA1Init(&ctx);
833 SHA1Update(&ctx, image, len);
834 SHA1Final(digest, &ctx);
835
836 for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
837 if (digest[i] != baseline[i])
838 return (-1);
839 }
840
841 return (0);
842 }
843
844 static const char *
type_to_str(boot_module_type_t type)845 type_to_str(boot_module_type_t type)
846 {
847 switch (type) {
848 case BMT_ROOTFS:
849 return ("rootfs");
850 case BMT_FILE:
851 return ("file");
852 case BMT_HASH:
853 return ("hash");
854 default:
855 return ("unknown");
856 }
857 }
858
859 static void
check_images(void)860 check_images(void)
861 {
862 uint_t i;
863 char displayhash[SHA1_ASCII_LENGTH + 1];
864
865 for (i = 0; i < modules_used; i++) {
866 if (prom_debug) {
867 dboot_printf("module #%d: name %s type %s "
868 "addr %lx size %lx\n",
869 i, (char *)(uintptr_t)modules[i].bm_name,
870 type_to_str(modules[i].bm_type),
871 (ulong_t)modules[i].bm_addr,
872 (ulong_t)modules[i].bm_size);
873 }
874
875 if (modules[i].bm_type == BMT_HASH ||
876 modules[i].bm_hash == NULL) {
877 DBG_MSG("module has no hash; skipping check\n");
878 continue;
879 }
880 (void) memcpy(displayhash,
881 (void *)(uintptr_t)modules[i].bm_hash,
882 SHA1_ASCII_LENGTH);
883 displayhash[SHA1_ASCII_LENGTH] = '\0';
884 if (prom_debug) {
885 dboot_printf("checking expected hash [%s]: ",
886 displayhash);
887 }
888
889 if (check_image_hash(i) != 0)
890 dboot_panic("hash mismatch!\n");
891 else
892 DBG_MSG("OK\n");
893 }
894 }
895
896 /*
897 * Determine the module's starting address, size, name, and type, and fill the
898 * boot_modules structure. This structure is used by the bop code, except for
899 * hashes which are checked prior to transferring control to the kernel.
900 */
901 static void
process_module(mb_module_t * mod)902 process_module(mb_module_t *mod)
903 {
904 int midx = modules_used++;
905 char *p, *q;
906
907 if (prom_debug) {
908 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
909 midx, (char *)(mod->mod_name),
910 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
911 }
912
913 if (mod->mod_start > mod->mod_end) {
914 dboot_panic("module #%d: module start address 0x%lx greater "
915 "than end address 0x%lx", midx,
916 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
917 }
918
919 /*
920 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
921 * the address of the last valid byte in a module plus 1 as mod_end.
922 * This is of course a bug; the multiboot specification simply states
923 * that mod_start and mod_end "contain the start and end addresses of
924 * the boot module itself" which is pretty obviously not what GRUB is
925 * doing. However, fixing it requires that not only this code be
926 * changed but also that other code consuming this value and values
927 * derived from it be fixed, and that the kernel and GRUB must either
928 * both have the bug or neither. While there are a lot of combinations
929 * that will work, there are also some that won't, so for simplicity
930 * we'll just cope with the bug. That means we won't actually hash the
931 * byte at mod_end, and we will expect that mod_end for the hash file
932 * itself is one greater than some multiple of 41 (40 bytes of ASCII
933 * hash plus a newline for each module). We set bm_size to the true
934 * correct number of bytes in each module, achieving exactly this.
935 */
936
937 modules[midx].bm_addr = mod->mod_start;
938 modules[midx].bm_size = mod->mod_end - mod->mod_start;
939 modules[midx].bm_name = mod->mod_name;
940 modules[midx].bm_hash = NULL;
941 modules[midx].bm_type = BMT_FILE;
942
943 if (mod->mod_name == NULL) {
944 modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
945 return;
946 }
947
948 p = (char *)(uintptr_t)mod->mod_name;
949 modules[midx].bm_name =
950 (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
951
952 while (p != NULL) {
953 q = strsep(&p, " \t\f\n\r");
954 if (strncmp(q, "name=", 5) == 0) {
955 if (q[5] != '\0' && !isspace(q[5])) {
956 modules[midx].bm_name =
957 (native_ptr_t)(uintptr_t)(q + 5);
958 }
959 continue;
960 }
961
962 if (strncmp(q, "type=", 5) == 0) {
963 if (q[5] == '\0' || isspace(q[5]))
964 continue;
965 q += 5;
966 if (strcmp(q, "rootfs") == 0) {
967 modules[midx].bm_type = BMT_ROOTFS;
968 } else if (strcmp(q, "hash") == 0) {
969 modules[midx].bm_type = BMT_HASH;
970 } else if (strcmp(q, "file") != 0) {
971 dboot_printf("\tmodule #%d: unknown module "
972 "type '%s'; defaulting to 'file'",
973 midx, q);
974 }
975 continue;
976 }
977
978 if (strncmp(q, "hash=", 5) == 0) {
979 if (q[5] != '\0' && !isspace(q[5])) {
980 modules[midx].bm_hash =
981 (native_ptr_t)(uintptr_t)(q + 5);
982 }
983 continue;
984 }
985
986 dboot_printf("ignoring unknown option '%s'\n", q);
987 }
988 }
989
990 /*
991 * Backward compatibility: if there are exactly one or two modules, both
992 * of type 'file' and neither with an embedded hash value, we have been
993 * given the legacy style modules. In this case we need to treat the first
994 * module as a rootfs and the second as a hash referencing that module.
995 * Otherwise, even if the configuration is invalid, we assume that the
996 * operator knows what he's doing or at least isn't being bitten by this
997 * interface change.
998 */
999 static void
fixup_modules(void)1000 fixup_modules(void)
1001 {
1002 if (modules_used == 0 || modules_used > 2)
1003 return;
1004
1005 if (modules[0].bm_type != BMT_FILE ||
1006 modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1007 return;
1008 }
1009
1010 if (modules[0].bm_hash != NULL ||
1011 modules_used > 1 && modules[1].bm_hash != NULL) {
1012 return;
1013 }
1014
1015 modules[0].bm_type = BMT_ROOTFS;
1016 if (modules_used > 1) {
1017 modules[1].bm_type = BMT_HASH;
1018 modules[1].bm_name = modules[0].bm_name;
1019 }
1020 }
1021
1022 /*
1023 * For modules that do not have assigned hashes but have a separate hash module,
1024 * find the assigned hash module and set the primary module's bm_hash to point
1025 * to the hash data from that module. We will then ignore modules of type
1026 * BMT_HASH from this point forward.
1027 */
1028 static void
assign_module_hashes(void)1029 assign_module_hashes(void)
1030 {
1031 uint_t i, j;
1032
1033 for (i = 0; i < modules_used; i++) {
1034 if (modules[i].bm_type == BMT_HASH ||
1035 modules[i].bm_hash != NULL) {
1036 continue;
1037 }
1038
1039 for (j = 0; j < modules_used; j++) {
1040 if (modules[j].bm_type != BMT_HASH ||
1041 strcmp((char *)(uintptr_t)modules[j].bm_name,
1042 (char *)(uintptr_t)modules[i].bm_name) != 0) {
1043 continue;
1044 }
1045
1046 if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1047 dboot_printf("Short hash module of length "
1048 "0x%lx bytes; ignoring\n",
1049 (ulong_t)modules[j].bm_size);
1050 } else {
1051 modules[i].bm_hash = modules[j].bm_addr;
1052 }
1053 break;
1054 }
1055 }
1056 }
1057
1058 /*
1059 * During memory allocation, find the highest address not used yet.
1060 */
1061 static void
check_higher(paddr_t a)1062 check_higher(paddr_t a)
1063 {
1064 if (a < next_avail_addr)
1065 return;
1066 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
1067 DBG(next_avail_addr);
1068 }
1069
/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 *
 * We then build the phys_install memlist from the multiboot information.
 *
 * In order, this routine:
 *  - panics if the loader passed more than MAX_BOOT_MODULES modules;
 *  - raises next_avail_addr (via check_higher()) above _end, the end of
 *    every module, the end of the multiboot memory map and the command
 *    line address;
 *  - records the module list in bi and runs fixup_modules(),
 *    assign_module_hashes() and check_images();
 *  - fills memlists[] / rsvdmemlists[] (and the PCI memlist) from either
 *    the multiboot memory map or the basic mem_lower/mem_upper fields;
 *  - finishes with sort_physinstall() and build_rsvdmemlists().
 */
static void
init_mem_alloc(void)
{
	mb_memory_map_t *mmap;
	mb_module_t *mod;
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	extern char _end[];
	int i;

	DBG_MSG("Entered init_mem_alloc()\n");
	DBG((uintptr_t)mb_info);

	if (mb_info->mods_count > MAX_BOOT_MODULES) {
		dboot_panic("Too many modules (%d) -- the maximum is %d.",
		    mb_info->mods_count, MAX_BOOT_MODULES);
	}
	/*
	 * search the modules to find the last used address
	 * we'll build the module list while we're walking through here
	 */
	DBG_MSG("\nFinding Modules\n");
	/* dboot's own image ends at _end; never allocate below that. */
	check_higher((paddr_t)(uintptr_t)&_end);
	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
	    i < mb_info->mods_count;
	    ++mod, ++i) {
		process_module(mod);
		check_higher(mod->mod_end);
	}
	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = mb_info->mods_count;
	DBG(bi->bi_module_cnt);

	fixup_modules();
	assign_module_hashes();
	check_images();

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	DBG(mb_info->flags);
	max_mem = 0;
	/* Flag bit 0x40: the mmap_addr / mmap_length fields are used. */
	if (mb_info->flags & 0x40) {
		/* number of map entries visited, whatever their type */
		int cnt = 0;

		DBG(mb_info->mmap_addr);
		DBG(mb_info->mmap_length);
		/* keep the allocator clear of the memory map itself */
		check_higher(mb_info->mmap_addr + mb_info->mmap_length);

		/*
		 * Entries are variable length: step to the next one by adding
		 * the entry's size field plus the width of that field itself.
		 */
		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
		    + sizeof (mmap->size))) {
			++cnt;
			/* assemble 64 bit base and end from 32 bit halves */
			start = ((uint64_t)mmap->base_addr_high << 32) +
			    mmap->base_addr_low;
			end = start + ((uint64_t)mmap->length_high << 32) +
			    mmap->length_low;

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", mmap->type, start, end);

			/*
			 * page align start and end; start rounds up and end
			 * rounds down, so ranges smaller than a whole page
			 * collapse and are skipped.
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM; type 2 ranges are kept
			 * on the reserved list and all other types ignored.
			 *
			 * NOTE(review): both limit checks below run after the
			 * slot has been written; presumably the arrays hold
			 * MAX_MEMLIST + 1 entries -- confirm at the
			 * declarations.
			 */
			switch (mmap->type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				memlists[memlists_used].addr = start;
				memlists[memlists_used].size = end - start;
				++memlists_used;
				if (memlists_used > MAX_MEMLIST)
					dboot_panic("too many memlists");
				break;
			case 2:
				rsvdmemlists[rsvdmemlists_used].addr = start;
				rsvdmemlists[rsvdmemlists_used].size =
				    end - start;
				++rsvdmemlists_used;
				if (rsvdmemlists_used > MAX_MEMLIST)
					dboot_panic("too many rsvdmemlists");
				break;
			default:
				continue;
			}
		}
		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
	} else if (mb_info->flags & 0x01) {
		/*
		 * Flag bit 0x01: only mem_lower / mem_upper are used.  Both
		 * are scaled by 1024 here; the first range starts at 0 and
		 * the second at 1Meg.
		 *
		 * NOTE(review): max_mem is left at 0 on this path and
		 * memlists_used is not checked against MAX_MEMLIST.
		 */
		DBG(mb_info->mem_lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = mb_info->mem_lower * 1024;
		++memlists_used;
		DBG(mb_info->mem_upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = mb_info->mem_upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr =
		    (mb_info->mem_upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	/* the command line must not be handed out by the allocator either */
	check_higher(bi->bi_cmdline);

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}
1213 #endif /* !__xpv */
1214
1215 /*
1216 * Simple memory allocator, allocates aligned physical memory.
1217 * Note that startup_kernel() only allocates memory, never frees.
1218 * Memory usage just grows in an upward direction.
1219 */
1220 static void *
do_mem_alloc(uint32_t size,uint32_t align)1221 do_mem_alloc(uint32_t size, uint32_t align)
1222 {
1223 uint_t i;
1224 uint64_t best;
1225 uint64_t start;
1226 uint64_t end;
1227
1228 /*
1229 * make sure size is a multiple of pagesize
1230 */
1231 size = RNDUP(size, MMU_PAGESIZE);
1232 next_avail_addr = RNDUP(next_avail_addr, align);
1233
1234 /*
1235 * XXPV fixme joe
1236 *
1237 * a really large bootarchive that causes you to run out of memory
1238 * may cause this to blow up
1239 */
1240 /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1241 best = (uint64_t)-size;
1242 for (i = 0; i < memlists_used; ++i) {
1243 start = memlists[i].addr;
1244 #if defined(__xpv)
1245 start += mfn_base;
1246 #endif
1247 end = start + memlists[i].size;
1248
1249 /*
1250 * did we find the desired address?
1251 */
1252 if (start <= next_avail_addr && next_avail_addr + size <= end) {
1253 best = next_avail_addr;
1254 goto done;
1255 }
1256
1257 /*
1258 * if not is this address the best so far?
1259 */
1260 if (start > next_avail_addr && start < best &&
1261 RNDUP(start, align) + size <= end)
1262 best = RNDUP(start, align);
1263 }
1264
1265 /*
1266 * We didn't find exactly the address we wanted, due to going off the
1267 * end of a memory region. Return the best found memory address.
1268 */
1269 done:
1270 next_avail_addr = best + size;
1271 #if defined(__xpv)
1272 if (next_avail_addr > scratch_end)
1273 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1274 "0x%lx", (ulong_t)next_avail_addr,
1275 (ulong_t)scratch_end);
1276 #endif
1277 (void) memset((void *)(uintptr_t)best, 0, size);
1278 return ((void *)(uintptr_t)best);
1279 }
1280
1281 void *
mem_alloc(uint32_t size)1282 mem_alloc(uint32_t size)
1283 {
1284 return (do_mem_alloc(size, MMU_PAGESIZE));
1285 }
1286
1287
1288 /*
1289 * Build page tables to map all of memory used so far as well as the kernel.
1290 */
1291 static void
build_page_tables(void)1292 build_page_tables(void)
1293 {
1294 uint32_t psize;
1295 uint32_t level;
1296 uint32_t off;
1297 uint64_t start;
1298 #if !defined(__xpv)
1299 uint32_t i;
1300 uint64_t end;
1301 #endif /* __xpv */
1302
1303 /*
1304 * If we're on metal, we need to create the top level pagetable.
1305 */
1306 #if defined(__xpv)
1307 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1308 #else /* __xpv */
1309 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1310 #endif /* __xpv */
1311 DBG((uintptr_t)top_page_table);
1312
1313 /*
1314 * Determine if we'll use large mappings for kernel, then map it.
1315 */
1316 if (largepage_support) {
1317 psize = lpagesize;
1318 level = 1;
1319 } else {
1320 psize = MMU_PAGESIZE;
1321 level = 0;
1322 }
1323
1324 DBG_MSG("Mapping kernel\n");
1325 DBG(ktext_phys);
1326 DBG(target_kernel_text);
1327 DBG(ksize);
1328 DBG(psize);
1329 for (off = 0; off < ksize; off += psize)
1330 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1331
1332 /*
1333 * The kernel will need a 1 page window to work with page tables
1334 */
1335 bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1336 DBG(bi->bi_pt_window);
1337 bi->bi_pte_to_pt_window =
1338 (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1339 DBG(bi->bi_pte_to_pt_window);
1340
1341 #if defined(__xpv)
1342 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1343 /* If this is a domU we're done. */
1344 DBG_MSG("\nPage tables constructed\n");
1345 return;
1346 }
1347 #endif /* __xpv */
1348
1349 /*
1350 * We need 1:1 mappings for the lower 1M of memory to access
1351 * BIOS tables used by a couple of drivers during boot.
1352 *
1353 * The following code works because our simple memory allocator
1354 * only grows usage in an upwards direction.
1355 *
1356 * Note that by this point in boot some mappings for low memory
1357 * may already exist because we've already accessed device in low
1358 * memory. (Specifically the video frame buffer and keyboard
1359 * status ports.) If we're booting on raw hardware then GRUB
1360 * created these mappings for us. If we're booting under a
1361 * hypervisor then we went ahead and remapped these devices into
1362 * memory allocated within dboot itself.
1363 */
1364 if (map_debug)
1365 dboot_printf("1:1 map pa=0..1Meg\n");
1366 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1367 #if defined(__xpv)
1368 map_ma_at_va(start, start, 0);
1369 #else /* __xpv */
1370 map_pa_at_va(start, start, 0);
1371 #endif /* __xpv */
1372 }
1373
1374 #if !defined(__xpv)
1375 for (i = 0; i < memlists_used; ++i) {
1376 start = memlists[i].addr;
1377
1378 end = start + memlists[i].size;
1379
1380 if (map_debug)
1381 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1382 start, end);
1383 while (start < end && start < next_avail_addr) {
1384 map_pa_at_va(start, start, 0);
1385 start += MMU_PAGESIZE;
1386 }
1387 }
1388 #endif /* !__xpv */
1389
1390 DBG_MSG("\nPage tables constructed\n");
1391 }
1392
/*
 * Panic text used by startup_kernel() when the boot command line still
 * mentions "multiboot"; it tells the user how to rewrite the grub entry.
 * The continuation lines are part of the string literal, so they must stay
 * flush against the left margin.
 */
#define NO_MULTIBOOT \
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1399
1400 /*
1401 * startup_kernel has a pretty simple job. It builds pagetables which reflect
1402 * 1:1 mappings for all memory in use. It then also adds mappings for
1403 * the kernel nucleus at virtual address of target_kernel_text using large page
1404 * mappings. The page table pages are also accessible at 1:1 mapped
1405 * virtual addresses.
1406 */
1407 /*ARGSUSED*/
1408 void
startup_kernel(void)1409 startup_kernel(void)
1410 {
1411 char *cmdline;
1412 uintptr_t addr;
1413 #if defined(__xpv)
1414 physdev_set_iopl_t set_iopl;
1415 #endif /* __xpv */
1416
1417 /*
1418 * At this point we are executing in a 32 bit real mode.
1419 */
1420 #if defined(__xpv)
1421 cmdline = (char *)xen_info->cmd_line;
1422 #else /* __xpv */
1423 cmdline = (char *)mb_info->cmdline;
1424 #endif /* __xpv */
1425
1426 prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1427 map_debug = (strstr(cmdline, "map_debug") != NULL);
1428
1429 #if defined(__xpv)
1430 /*
1431 * For dom0, before we initialize the console subsystem we'll
1432 * need to enable io operations, so set I/O priveldge level to 1.
1433 */
1434 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1435 set_iopl.iopl = 1;
1436 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1437 }
1438 #endif /* __xpv */
1439
1440 bcons_init(cmdline);
1441 DBG_MSG("\n\nSolaris prekernel set: ");
1442 DBG_MSG(cmdline);
1443 DBG_MSG("\n");
1444
1445 if (strstr(cmdline, "multiboot") != NULL) {
1446 dboot_panic(NO_MULTIBOOT);
1447 }
1448
1449 /*
1450 * boot info must be 16 byte aligned for 64 bit kernel ABI
1451 */
1452 addr = (uintptr_t)boot_info;
1453 addr = (addr + 0xf) & ~0xf;
1454 bi = (struct xboot_info *)addr;
1455 DBG((uintptr_t)bi);
1456 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1457
1458 /*
1459 * Need correct target_kernel_text value
1460 */
1461 #if defined(_BOOT_TARGET_amd64)
1462 target_kernel_text = KERNEL_TEXT_amd64;
1463 #elif defined(__xpv)
1464 target_kernel_text = KERNEL_TEXT_i386_xpv;
1465 #else
1466 target_kernel_text = KERNEL_TEXT_i386;
1467 #endif
1468 DBG(target_kernel_text);
1469
1470 #if defined(__xpv)
1471
1472 /*
1473 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
1474 */
1475
1476 #if defined(_BOOT_TARGET_amd64)
1477 /*
1478 * 64-bit hypervisor.
1479 */
1480 amd64_support = 1;
1481 pae_support = 1;
1482
1483 #else /* _BOOT_TARGET_amd64 */
1484
1485 /*
1486 * See if we are running on a PAE Hypervisor
1487 */
1488 {
1489 xen_capabilities_info_t caps;
1490
1491 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1492 dboot_panic("HYPERVISOR_xen_version(caps) failed");
1493 caps[sizeof (caps) - 1] = 0;
1494 if (prom_debug)
1495 dboot_printf("xen capabilities %s\n", caps);
1496 if (strstr(caps, "x86_32p") != NULL)
1497 pae_support = 1;
1498 }
1499
1500 #endif /* _BOOT_TARGET_amd64 */
1501 {
1502 xen_platform_parameters_t p;
1503
1504 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1505 dboot_panic("HYPERVISOR_xen_version(parms) failed");
1506 DBG(p.virt_start);
1507 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1508 }
1509
1510 /*
1511 * The hypervisor loads stuff starting at 1Gig
1512 */
1513 mfn_base = ONE_GIG;
1514 DBG(mfn_base);
1515
1516 /*
1517 * enable writable page table mode for the hypervisor
1518 */
1519 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1520 VMASST_TYPE_writable_pagetables) < 0)
1521 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1522
1523 /*
1524 * check for NX support
1525 */
1526 if (pae_support) {
1527 uint32_t eax = 0x80000000;
1528 uint32_t edx = get_cpuid_edx(&eax);
1529
1530 if (eax >= 0x80000001) {
1531 eax = 0x80000001;
1532 edx = get_cpuid_edx(&eax);
1533 if (edx & CPUID_AMD_EDX_NX)
1534 NX_support = 1;
1535 }
1536 }
1537
1538 #if !defined(_BOOT_TARGET_amd64)
1539
1540 /*
1541 * The 32-bit hypervisor uses segmentation to protect itself from
1542 * guests. This means when a guest attempts to install a flat 4GB
1543 * code or data descriptor the 32-bit hypervisor will protect itself
1544 * by silently shrinking the segment such that if the guest attempts
1545 * any access where the hypervisor lives a #gp fault is generated.
1546 * The problem is that some applications expect a full 4GB flat
1547 * segment for their current thread pointer and will use negative
1548 * offset segment wrap around to access data. TLS support in linux
1549 * brand is one example of this.
1550 *
1551 * The 32-bit hypervisor can catch the #gp fault in these cases
1552 * and emulate the access without passing the #gp fault to the guest
1553 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1554 * Seems like this should have been the default.
1555 * Either way, we want the hypervisor -- and not Solaris -- to deal
1556 * to deal with emulating these accesses.
1557 */
1558 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1559 VMASST_TYPE_4gb_segments) < 0)
1560 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1561 #endif /* !_BOOT_TARGET_amd64 */
1562
1563 #else /* __xpv */
1564
1565 /*
1566 * use cpuid to enable MMU features
1567 */
1568 if (have_cpuid()) {
1569 uint32_t eax, edx;
1570
1571 eax = 1;
1572 edx = get_cpuid_edx(&eax);
1573 if (edx & CPUID_INTC_EDX_PSE)
1574 largepage_support = 1;
1575 if (edx & CPUID_INTC_EDX_PGE)
1576 pge_support = 1;
1577 if (edx & CPUID_INTC_EDX_PAE)
1578 pae_support = 1;
1579
1580 eax = 0x80000000;
1581 edx = get_cpuid_edx(&eax);
1582 if (eax >= 0x80000001) {
1583 eax = 0x80000001;
1584 edx = get_cpuid_edx(&eax);
1585 if (edx & CPUID_AMD_EDX_LM)
1586 amd64_support = 1;
1587 if (edx & CPUID_AMD_EDX_NX)
1588 NX_support = 1;
1589 }
1590 } else {
1591 dboot_printf("cpuid not supported\n");
1592 }
1593 #endif /* __xpv */
1594
1595
1596 #if defined(_BOOT_TARGET_amd64)
1597 if (amd64_support == 0)
1598 dboot_panic("long mode not supported, rebooting");
1599 else if (pae_support == 0)
1600 dboot_panic("long mode, but no PAE; rebooting");
1601 #else
1602 /*
1603 * Allow the command line to over-ride use of PAE for 32 bit.
1604 */
1605 if (strstr(cmdline, "disablePAE=true") != NULL) {
1606 pae_support = 0;
1607 NX_support = 0;
1608 amd64_support = 0;
1609 }
1610 #endif
1611
1612 /*
1613 * initialize the simple memory allocator
1614 */
1615 init_mem_alloc();
1616
1617 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1618 /*
1619 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1620 */
1621 if (max_mem < FOUR_GIG && NX_support == 0)
1622 pae_support = 0;
1623 #endif
1624
1625 /*
1626 * configure mmu information
1627 */
1628 if (pae_support) {
1629 shift_amt = shift_amt_pae;
1630 ptes_per_table = 512;
1631 pte_size = 8;
1632 lpagesize = TWO_MEG;
1633 #if defined(_BOOT_TARGET_amd64)
1634 top_level = 3;
1635 #else
1636 top_level = 2;
1637 #endif
1638 } else {
1639 pae_support = 0;
1640 NX_support = 0;
1641 shift_amt = shift_amt_nopae;
1642 ptes_per_table = 1024;
1643 pte_size = 4;
1644 lpagesize = FOUR_MEG;
1645 top_level = 1;
1646 }
1647
1648 DBG(pge_support);
1649 DBG(NX_support);
1650 DBG(largepage_support);
1651 DBG(amd64_support);
1652 DBG(top_level);
1653 DBG(pte_size);
1654 DBG(ptes_per_table);
1655 DBG(lpagesize);
1656
1657 #if defined(__xpv)
1658 ktext_phys = ONE_GIG; /* from UNIX Mapfile */
1659 #else
1660 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */
1661 #endif
1662
1663 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1664 /*
1665 * For grub, copy kernel bits from the ELF64 file to final place.
1666 */
1667 DBG_MSG("\nAllocating nucleus pages.\n");
1668 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1669 if (ktext_phys == 0)
1670 dboot_panic("failed to allocate aligned kernel memory");
1671 if (dboot_elfload64(mb_header.load_addr) != 0)
1672 dboot_panic("failed to parse kernel ELF image, rebooting");
1673 #endif
1674
1675 DBG(ktext_phys);
1676
1677 /*
1678 * Allocate page tables.
1679 */
1680 build_page_tables();
1681
1682 /*
1683 * return to assembly code to switch to running kernel
1684 */
1685 entry_addr_low = (uint32_t)target_kernel_text;
1686 DBG(entry_addr_low);
1687 bi->bi_use_largepage = largepage_support;
1688 bi->bi_use_pae = pae_support;
1689 bi->bi_use_pge = pge_support;
1690 bi->bi_use_nx = NX_support;
1691
1692 #if defined(__xpv)
1693
1694 bi->bi_next_paddr = next_avail_addr - mfn_base;
1695 DBG(bi->bi_next_paddr);
1696 bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1697 DBG(bi->bi_next_vaddr);
1698
1699 /*
1700 * unmap unused pages in start area to make them available for DMA
1701 */
1702 while (next_avail_addr < scratch_end) {
1703 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
1704 0, UVMF_INVLPG | UVMF_LOCAL);
1705 next_avail_addr += MMU_PAGESIZE;
1706 }
1707
1708 bi->bi_xen_start_info = (uintptr_t)xen_info;
1709 DBG((uintptr_t)HYPERVISOR_shared_info);
1710 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1711 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1712
1713 #else /* __xpv */
1714
1715 bi->bi_next_paddr = next_avail_addr;
1716 DBG(bi->bi_next_paddr);
1717 bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1718 DBG(bi->bi_next_vaddr);
1719 bi->bi_mb_info = (uintptr_t)mb_info;
1720 bi->bi_top_page_table = (uintptr_t)top_page_table;
1721
1722 #endif /* __xpv */
1723
1724 bi->bi_kseg_size = FOUR_MEG;
1725 DBG(bi->bi_kseg_size);
1726
1727 #ifndef __xpv
1728 if (map_debug)
1729 dump_tables();
1730 #endif
1731
1732 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1733 }
1734