1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2013 Joyent, Inc. All rights reserved.
26 */
27 /*
28 * Copyright (c) 2010, Intel Corporation.
29 * All rights reserved.
30 */
31
32 #include <sys/types.h>
33 #include <sys/t_lock.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/systm.h>
38 #include <sys/user.h>
39 #include <sys/mman.h>
40 #include <sys/vm.h>
41 #include <sys/conf.h>
42 #include <sys/avintr.h>
43 #include <sys/autoconf.h>
44 #include <sys/disp.h>
45 #include <sys/class.h>
46 #include <sys/bitmap.h>
47
48 #include <sys/privregs.h>
49
50 #include <sys/proc.h>
51 #include <sys/buf.h>
52 #include <sys/kmem.h>
53 #include <sys/mem.h>
54 #include <sys/kstat.h>
55
56 #include <sys/reboot.h>
57
58 #include <sys/cred.h>
59 #include <sys/vnode.h>
60 #include <sys/file.h>
61
62 #include <sys/procfs.h>
63
64 #include <sys/vfs.h>
65 #include <sys/cmn_err.h>
66 #include <sys/utsname.h>
67 #include <sys/debug.h>
68 #include <sys/kdi.h>
69
70 #include <sys/dumphdr.h>
71 #include <sys/bootconf.h>
72 #include <sys/memlist_plat.h>
73 #include <sys/varargs.h>
74 #include <sys/promif.h>
75 #include <sys/modctl.h>
76
77 #include <sys/sunddi.h>
78 #include <sys/sunndi.h>
79 #include <sys/ndi_impldefs.h>
80 #include <sys/ddidmareq.h>
81 #include <sys/psw.h>
82 #include <sys/regset.h>
83 #include <sys/clock.h>
84 #include <sys/pte.h>
85 #include <sys/tss.h>
86 #include <sys/stack.h>
87 #include <sys/trap.h>
88 #include <sys/fp.h>
89 #include <vm/kboot_mmu.h>
90 #include <vm/anon.h>
91 #include <vm/as.h>
92 #include <vm/page.h>
93 #include <vm/seg.h>
94 #include <vm/seg_dev.h>
95 #include <vm/seg_kmem.h>
96 #include <vm/seg_kpm.h>
97 #include <vm/seg_map.h>
98 #include <vm/seg_vn.h>
99 #include <vm/seg_kp.h>
100 #include <sys/memnode.h>
101 #include <vm/vm_dep.h>
102 #include <sys/thread.h>
103 #include <sys/sysconf.h>
104 #include <sys/vm_machparam.h>
105 #include <sys/archsystm.h>
106 #include <sys/machsystm.h>
107 #include <vm/hat.h>
108 #include <vm/hat_i86.h>
109 #include <sys/pmem.h>
110 #include <sys/smp_impldefs.h>
111 #include <sys/x86_archext.h>
112 #include <sys/cpuvar.h>
113 #include <sys/segments.h>
114 #include <sys/clconf.h>
115 #include <sys/kobj.h>
116 #include <sys/kobj_lex.h>
117 #include <sys/cpc_impl.h>
118 #include <sys/cpu_module.h>
119 #include <sys/smbios.h>
120 #include <sys/debug_info.h>
121 #include <sys/bootinfo.h>
122 #include <sys/ddi_periodic.h>
123 #include <sys/systeminfo.h>
124 #include <sys/multiboot.h>
125 #include <sys/ramdisk.h>
126
127 #ifdef __xpv
128
129 #include <sys/hypervisor.h>
130 #include <sys/xen_mmu.h>
131 #include <sys/evtchn_impl.h>
132 #include <sys/gnttab.h>
133 #include <sys/xpv_panic.h>
134 #include <xen/sys/xenbus_comms.h>
135 #include <xen/public/physdev.h>
136
137 extern void xen_late_startup(void);
138
139 struct xen_evt_data cpu0_evt_data;
140
141 #else /* __xpv */
142 #include <sys/memlist_impl.h>
143
144 extern void mem_config_init(void);
145 #endif /* __xpv */
146
147 extern void progressbar_init(void);
148 extern void brand_init(void);
149 extern void pcf_init(void);
150 extern void pg_init(void);
151
152 extern int size_pse_array(pgcnt_t, int);
153
154 #if defined(_SOFT_HOSTID)
155
156 #include <sys/rtc.h>
157
158 static int32_t set_soft_hostid(void);
159 static char hostid_file[] = "/etc/hostid";
160
161 #endif
162
163 void *gfx_devinfo_list;
164
165 #if defined(__amd64) && !defined(__xpv)
166 extern void immu_startup(void);
167 #endif
168
169 /*
170 * XXX make declaration below "static" when drivers no longer use this
171 * interface.
172 */
173 extern caddr_t p0_va; /* Virtual address for accessing physical page 0 */
174
175 /*
176 * segkp
177 */
178 extern int segkp_fromheap;
179
180 static void kvm_init(void);
181 static void startup_init(void);
182 static void startup_memlist(void);
183 static void startup_kmem(void);
184 static void startup_modules(void);
185 static void startup_vm(void);
186 static void startup_end(void);
187 static void layout_kernel_va(void);
188
189 /*
190 * Declare these as initialized data so we can patch them.
191 */
192 #ifdef __i386
193
194 /*
195 * Due to virtual address space limitations running in 32 bit mode, restrict
196 * the amount of physical memory configured to a max of PHYSMEM pages (16g).
197 *
198  * If the physical max memory size of 64g were allowed to be configured, the
199  * size of user virtual address space would be less than 1g. A limited user
200 * address space greatly reduces the range of applications that can run.
201 *
202 * If more physical memory than PHYSMEM is required, users should preferably
203 * run in 64 bit mode which has far looser virtual address space limitations.
204 *
205 * If 64 bit mode is not available (as in IA32) and/or more physical memory
206 * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
207 * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
208 * should also be carefully tuned to balance out the need of the user
209 * application while minimizing the risk of kernel heap exhaustion due to
210 * kernelbase being set too high.
211 */
212 #define PHYSMEM 0x400000
213
214 #else /* __amd64 */
215
216 /*
217 * For now we can handle memory with physical addresses up to about
218 * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
219 * half the VA space for seg_kpm. When systems get bigger than 64TB this
220 * code will need revisiting. There is an implicit assumption that there
221 * are no *huge* holes in the physical address space too.
222 */
223 #define TERABYTE (1ul << 40)
224 #define PHYSMEM_MAX64 mmu_btop(64 * TERABYTE)
225 #define PHYSMEM PHYSMEM_MAX64
226 #define AMD64_VA_HOLE_END 0xFFFF800000000000ul
227
228 #endif /* __amd64 */
229
230 pgcnt_t physmem = PHYSMEM;
231 pgcnt_t obp_pages; /* Memory used by PROM for its text and data */
232
233 char *kobj_file_buf;
234 int kobj_file_bufsize; /* set in /etc/system */
235
236 /* Global variables for MP support. Used in mp_startup */
237 caddr_t rm_platter_va = 0;
238 uint32_t rm_platter_pa;
239
240 int auto_lpg_disable = 1;
241
242 /*
243 * Some CPUs have holes in the middle of the 64-bit virtual address range.
244 */
245 uintptr_t hole_start, hole_end;
246
247 /*
248 * kpm mapping window
249 */
250 caddr_t kpm_vbase;
251 size_t kpm_size;
252 static int kpm_desired;
253 #ifdef __amd64
254 static uintptr_t segkpm_base = (uintptr_t)SEGKPM_BASE;
255 #endif
256
257 /*
258 * Configuration parameters set at boot time.
259 */
260
261 caddr_t econtig; /* end of first block of contiguous kernel */
262
263 struct bootops *bootops = 0; /* passed in from boot */
264 struct bootops **bootopsp;
265 struct boot_syscalls *sysp; /* passed in from boot */
266
267 char bootblock_fstype[16];
268
269 char kern_bootargs[OBP_MAXPATHLEN];
270 char kern_bootfile[OBP_MAXPATHLEN];
271
272 /*
273 * ZFS zio segment. This allows us to exclude large portions of ZFS data that
274 * gets cached in kmem caches on the heap. If this is set to zero, we allocate
275 * zio buffers from their own segment, otherwise they are allocated from the
276 * heap. The optimization of allocating zio buffers from their own segment is
277 * only valid on 64-bit kernels.
278 */
279 #if defined(__amd64)
280 int segzio_fromheap = 0;
281 #else
282 int segzio_fromheap = 1;
283 #endif
284
285 /*
286  * New memory fragments can appear during startup() due to BOP_ALLOCs. How many
287  * depends on the number of BOP_ALLOC calls made, the sizes requested, the
288  * memory configuration, and whether boot.bin memory needs to be freed.
289 */
290 #define POSS_NEW_FRAGMENTS 12
291
292 /*
293 * VM data structures
294 */
295 long page_hashsz; /* Size of page hash table (power of two) */
296 unsigned int page_hashsz_shift; /* log2(page_hashsz) */
297 struct page *pp_base; /* Base of initial system page struct array */
298 struct page **page_hash; /* Page hash table */
299 pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */
300 size_t pse_table_size; /* Number of mutexes in pse_mutex[] */
301 int pse_shift; /* log2(pse_table_size) */
302 struct seg ktextseg; /* Segment used for kernel executable image */
303 struct seg kvalloc; /* Segment used for "valloc" mapping */
304 struct seg kpseg; /* Segment used for pageable kernel virt mem */
305 struct seg kmapseg; /* Segment used for generic kernel mappings */
306 struct seg kdebugseg; /* Segment used for the kernel debugger */
307
308 struct seg *segkmap = &kmapseg; /* Kernel generic mapping segment */
309 static struct seg *segmap = &kmapseg; /* easier to use name for in here */
310
311 struct seg *segkp = &kpseg; /* Pageable kernel virtual memory segment */
312
313 #if defined(__amd64)
314 struct seg kvseg_core; /* Segment used for the core heap */
315 struct seg kpmseg; /* Segment used for physical mapping */
316 struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */
317 #else
318 struct seg *segkpm = NULL; /* Unused on IA32 */
319 #endif
320
321 caddr_t segkp_base; /* Base address of segkp */
322 caddr_t segzio_base; /* Base address of segzio */
323 #if defined(__amd64)
324 pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */
325 #else
326 pgcnt_t segkpsize = 0;
327 #endif
328 pgcnt_t segziosize = 0; /* size of zio segment in pages */
329
330 /*
331 * A static DR page_t VA map is reserved that can map the page structures
332 * for a domain's entire RA space. The pages that back this space are
333 * dynamically allocated and need not be physically contiguous. The DR
334 * map size is derived from KPM size.
335 * This mechanism isn't used by x86 yet, so just stubs here.
336 */
337 int ppvm_enable = 0; /* Static virtual map for page structs */
338 page_t *ppvm_base = NULL; /* Base of page struct map */
339 pgcnt_t ppvm_size = 0; /* Size of page struct map */
340
341 /*
342 * VA range available to the debugger
343 */
344 const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
345 const size_t kdi_segdebugsize = SEGDEBUGSIZE;
346
347 struct memseg *memseg_base;
348 struct vnode unused_pages_vp;
349
350 #define FOURGB 0x100000000LL
351
352 struct memlist *memlist;
353
354 caddr_t s_text; /* start of kernel text segment */
355 caddr_t e_text; /* end of kernel text segment */
356 caddr_t s_data; /* start of kernel data segment */
357 caddr_t e_data; /* end of kernel data segment */
358 caddr_t modtext; /* start of loadable module text reserved */
359 caddr_t e_modtext; /* end of loadable module text reserved */
360 caddr_t moddata; /* start of loadable module data reserved */
361 caddr_t e_moddata; /* end of loadable module data reserved */
362
363 struct memlist *phys_install; /* Total installed physical memory */
364 struct memlist *phys_avail; /* Total available physical memory */
365 struct memlist *bios_rsvd; /* Bios reserved memory */
366
367 /*
368 * kphysm_init returns the number of pages that were processed
369 */
370 static pgcnt_t kphysm_init(page_t *, pgcnt_t);
371
372 #define IO_PROP_SIZE 64 /* device property size */
373
374 /*
375 * a couple useful roundup macros
376 */
377 #define ROUND_UP_PAGE(x) \
378 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
379 #define ROUND_UP_LPAGE(x) \
380 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
381 #define ROUND_UP_4MEG(x) \
382 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
383 #define ROUND_UP_TOPLEVEL(x) \
384 ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
385
386 /*
387 * 32-bit Kernel's Virtual memory layout.
388 * +-----------------------+
389 * | |
390 * 0xFFC00000 -|-----------------------|- ARGSBASE
391 * | debugger |
392 * 0xFF800000 -|-----------------------|- SEGDEBUGBASE
393 * | Kernel Data |
394 * 0xFEC00000 -|-----------------------|
395 * | Kernel Text |
396 * 0xFE800000 -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
397 * |--- GDT ---|- GDT page (GDT_VA)
398 * |--- debug info ---|- debug info (DEBUG_INFO_VA)
399 * | |
400 * | page_t structures |
401 * | memsegs, memlists, |
402 * | page hash, etc. |
403 * --- -|-----------------------|- ekernelheap, valloc_base (floating)
404 * | | (segkp is just an arena in the heap)
405 * | |
406 * | kvseg |
407 * | |
408 * | |
409 * --- -|-----------------------|- kernelheap (floating)
410 * | Segkmap |
411 * 0xC3002000 -|-----------------------|- segmap_start (floating)
412 * | Red Zone |
413 * 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating)
414 * | | ||
415 * | Shared objects | \/
416 * | |
417 * : :
418 * | user data |
419 * |-----------------------|
420 * | user text |
421 * 0x08048000 -|-----------------------|
422 * | user stack |
423 * : :
424 * | invalid |
425 * 0x00000000 +-----------------------+
426 *
427 *
428 * 64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
429 * +-----------------------+
430 * | |
431 * 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE
432 * | debugger (?) |
433 * 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE
434 * | unused |
435 * +-----------------------+
436 * | Kernel Data |
437 * 0xFFFFFFFF.FBC00000 |-----------------------|
438 * | Kernel Text |
439 * 0xFFFFFFFF.FB800000 |-----------------------|- KERNEL_TEXT
440 * |--- GDT ---|- GDT page (GDT_VA)
441 * |--- debug info ---|- debug info (DEBUG_INFO_VA)
442 * | |
443 * | Core heap | (used for loadable modules)
444 * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
445 * | Kernel |
446 * | heap |
447 * 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating)
448 * | segmap |
449 * 0xFFFFFXXX.XXX00000 |-----------------------|- segmap_start (floating)
450 * | device mappings |
451 * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating)
452 * | segzio |
453 * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
454 * | segkp |
455 * --- |-----------------------|- segkp_base (floating)
456 * | page_t structures | valloc_base + valloc_sz
457 * | memsegs, memlists, |
458 * | page hash, etc. |
459 * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if > 1TB)
460 * | segkpm |
461 * 0xFFFFFE00.00000000 |-----------------------|
462 * | Red Zone |
463 * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if > 1TB)
464 * | User stack |- User space memory
465 * | |
466 * | shared objects, etc | (grows downwards)
467 * : :
468 * | |
469 * 0xFFFF8000.00000000 |-----------------------|
470 * | |
471 * | VA Hole / unused |
472 * | |
473 * 0x00008000.00000000 |-----------------------|
474 * | |
475 * | |
476 * : :
477 * | user heap | (grows upwards)
478 * | |
479 * | user data |
480 * |-----------------------|
481 * | user text |
482 * 0x00000000.04000000 |-----------------------|
483 * | invalid |
484 * 0x00000000.00000000 +-----------------------+
485 *
486 * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
487 * kernel, except that userlimit is raised to 0xfe000000
488 *
489 * Floating values:
490 *
491 * valloc_base: start of the kernel's memory management/tracking data
492 * structures. This region contains page_t structures for
493 * physical memory, memsegs, memlists, and the page hash.
494 *
495 * core_base: start of the kernel's "core" heap area on 64-bit systems.
496 * This area is intended to be used for global data as well as for module
497 * text/data that does not fit into the nucleus pages. The core heap is
498 * restricted to a 2GB range, allowing every address within it to be
499 * accessed using rip-relative addressing
500 *
501 * ekernelheap: end of kernelheap and start of segmap.
502 *
503 * kernelheap: start of kernel heap. On 32-bit systems, this starts right
504 * above a red zone that separates the user's address space from the
505 * kernel's. On 64-bit systems, it sits above segkp and segkpm.
506 *
507 * segmap_start: start of segmap. The length of segmap can be modified
508 * through eeprom. The default length is 16MB on 32-bit systems and 64MB
509 * on 64-bit systems.
510 *
511 * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
512 * decreased by 2X the size required for page_t. This allows the kernel
513 * heap to grow in size with physical memory. With sizeof(page_t) == 80
514 * bytes, the following shows the values of kernelbase and kernel heap
515 * sizes for different memory configurations (assuming default segmap and
516 * segkp sizes).
517 *
518 * mem size for kernelbase kernel heap
519 * size page_t's size
520 * ---- --------- ---------- -----------
521 * 1gb 0x01400000 0xd1800000 684MB
522 * 2gb 0x02800000 0xcf000000 704MB
523 * 4gb 0x05000000 0xca000000 744MB
524 * 6gb 0x07800000 0xc5000000 784MB
525 * 8gb 0x0a000000 0xc0000000 824MB
526 * 16gb 0x14000000 0xac000000 984MB
527 * 32gb 0x28000000 0x84000000 1304MB
528 * 64gb 0x50000000 0x34000000 1944MB (*)
529 *
530 * kernelbase is less than the abi minimum of 0xc0000000 for memory
531 * configurations above 8gb.
532 *
533 * (*) support for memory configurations above 32gb will require manual tuning
534 * of kernelbase to balance out the need of user applications.
535 */
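/*
 * For example, with 4gb of memory the table above works out as: space for
 * page_t's = 0x05000000 (80MB), so kernelbase = 0xd4000000 - 2 * 0x05000000 =
 * 0xca000000, leaving roughly 744MB of kernel heap.
 */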
536
537 /* real-time-clock initialization parameters */
538 extern time_t process_rtc_config_file(void);
539
540 uintptr_t kernelbase;
541 uintptr_t postbootkernelbase; /* not set till boot loader is gone */
542 uintptr_t eprom_kernelbase;
543 size_t segmapsize;
544 uintptr_t segmap_start;
545 int segmapfreelists;
546 pgcnt_t npages;
547 pgcnt_t orig_npages;
548 size_t core_size; /* size of "core" heap */
549 uintptr_t core_base; /* base address of "core" heap */
550
551 /*
552 * List of bootstrap pages. We mark these as allocated in startup.
553 * release_bootstrap() will free them when we're completely done with
554 * the bootstrap.
555 */
556 static page_t *bootpages;
557
558 /*
559 * boot time pages that have a vnode from the ramdisk will keep that forever.
560 */
561 static page_t *rd_pages;
562
563 /*
564 * Lower 64K
565 */
566 static page_t *lower_pages = NULL;
567 static int lower_pages_count = 0;
568
569 struct system_hardware system_hardware;
570
571 /*
572 * Enable some debugging messages concerning memory usage...
573 */
574 static void
575 print_memlist(char *title, struct memlist *mp)
576 {
577 prom_printf("MEMLIST: %s:\n", title);
578 while (mp != NULL) {
579 prom_printf("\tAddress 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
580 mp->ml_address, mp->ml_size);
581 mp = mp->ml_next;
582 }
583 }
584
585 /*
586 * XX64 need a comment here.. are these just default values, surely
587 * we read the "cpuid" type information to figure this out.
588 */
589 int l2cache_sz = 0x80000;
590 int l2cache_linesz = 0x40;
591 int l2cache_assoc = 1;
592
593 static size_t textrepl_min_gb = 10;
594
595 /*
596  * On 64-bit we use a predefined VA range for mapping devices in the kernel;
597  * on 32-bit the mappings are intermixed in the heap, so we use a bit map.
598 */
599 #ifdef __amd64
600
601 vmem_t *device_arena;
602 uintptr_t toxic_addr = (uintptr_t)NULL;
603 size_t toxic_size = 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
604
605 #else /* __i386 */
606
607 ulong_t *toxic_bit_map; /* one bit for each 4k of VA in heap_arena */
608 size_t toxic_bit_map_len = 0; /* in bits */
609
610 #endif /* __i386 */
611
612 /*
613 * Simple boot time debug facilities
614 */
615 static char *prm_dbg_str[] = {
616 "%s:%d: '%s' is 0x%x\n",
617 "%s:%d: '%s' is 0x%llx\n"
618 };
619
620 int prom_debug;
621
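/*
 * PRM_DEBUG() selects the 32-bit or 64-bit format string in prm_dbg_str[] by
 * using (sizeof (q) >> 3) as an index: 0 for 4-byte values, 1 for 8-byte
 * values. PRM_POINT() simply prints a file/line tagged message. Both do
 * nothing unless prom_debug is set.
 */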
622 #define PRM_DEBUG(q) if (prom_debug) \
623 prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
624 #define PRM_POINT(q) if (prom_debug) \
625 prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
626
627 /*
628  * This structure is used to keep track of the initial allocations
629 * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
630 * be >= the number of ADD_TO_ALLOCATIONS() executed in the code.
631 */
632 #define NUM_ALLOCATIONS 8
633 int num_allocations = 0;
634 struct {
635 void **al_ptr;
636 size_t al_size;
637 } allocations[NUM_ALLOCATIONS];
638 size_t valloc_sz = 0;
639 uintptr_t valloc_base;
640
641 #define ADD_TO_ALLOCATIONS(ptr, size) { \
642 size = ROUND_UP_PAGE(size); \
643 if (num_allocations == NUM_ALLOCATIONS) \
644 panic("too many ADD_TO_ALLOCATIONS()"); \
645 allocations[num_allocations].al_ptr = (void**)&ptr; \
646 allocations[num_allocations].al_size = size; \
647 valloc_sz += size; \
648 ++num_allocations; \
649 }
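/*
 * ADD_TO_ALLOCATIONS() only records a pointer/size pair and grows valloc_sz;
 * no memory moves until perform_allocations() below does a single BOP_ALLOC()
 * of valloc_sz bytes at valloc_base and then parcels that block out to each
 * registered pointer in registration order.
 */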
650
651 /*
652 * Allocate all the initial memory needed by the page allocator.
653 */
654 static void
655 perform_allocations(void)
656 {
657 caddr_t mem;
658 int i;
659 int valloc_align;
660
661 PRM_DEBUG(valloc_base);
662 PRM_DEBUG(valloc_sz);
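/*
 * Align to a large page when the MMU supports one: the index expression
 * (mmu.max_page_level > 0) evaluates to 1 in that case, selecting
 * mmu.level_size[1]; otherwise it selects level_size[0], the base page size.
 */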
663 valloc_align = mmu.level_size[mmu.max_page_level > 0];
664 mem = BOP_ALLOC(bootops, (caddr_t)valloc_base, valloc_sz, valloc_align);
665 if (mem != (caddr_t)valloc_base)
666 panic("BOP_ALLOC() failed");
667 bzero(mem, valloc_sz);
668 for (i = 0; i < num_allocations; ++i) {
669 *allocations[i].al_ptr = (void *)mem;
670 mem += allocations[i].al_size;
671 }
672 }
673
674 /*
675 * Our world looks like this at startup time.
676 *
677 * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
678 * at 0xfec00000. On a 64-bit OS, kernel text and data are loaded at
679 * 0xffffffff.fe800000 and 0xffffffff.fec00000 respectively. Those
680 * addresses are fixed in the binary at link time.
681 *
682 * On the text page:
683 * unix/genunix/krtld/module text loads.
684 *
685 * On the data page:
686 * unix/genunix/krtld/module data loads.
687 *
688 * Machine-dependent startup code
689 */
690 void
691 startup(void)
692 {
693 #if !defined(__xpv)
694 extern void startup_pci_bios(void);
695 #endif
696 extern cpuset_t cpu_ready_set;
697
698 /*
699  * Make sure that nobody tries to use segkpm until we have
700 * initialized it properly.
701 */
702 #if defined(__amd64)
703 kpm_desired = 1;
704 #endif
705 kpm_enable = 0;
706 CPUSET_ONLY(cpu_ready_set, 0); /* cpu 0 is boot cpu */
707
708 #if defined(__xpv) /* XXPV fix me! */
709 {
710 extern int segvn_use_regions;
711 segvn_use_regions = 0;
712 }
713 #endif
714 progressbar_init();
715 startup_init();
716 #if defined(__xpv)
717 startup_xen_version();
718 #endif
719 startup_memlist();
720 startup_kmem();
721 startup_vm();
722 #if !defined(__xpv)
723 /*
724 * Note we need to do this even on fast reboot in order to access
725 * the irq routing table (used for pci labels).
726 */
727 startup_pci_bios();
728 #endif
729 #if defined(__xpv)
730 startup_xen_mca();
731 #endif
732 startup_modules();
733
734 startup_end();
735 }
736
737 static void
738 startup_init()
739 {
740 PRM_POINT("startup_init() starting...");
741
742 /*
743 * Complete the extraction of cpuid data
744 */
745 cpuid_pass2(CPU);
746
747 (void) check_boot_version(BOP_GETVERSION(bootops));
748
749 /*
750 * Check for prom_debug in boot environment
751 */
752 if (BOP_GETPROPLEN(bootops, "prom_debug") >= 0) {
753 ++prom_debug;
754 PRM_POINT("prom_debug found in boot environment");
755 }
756
757 /*
758 * Collect node, cpu and memory configuration information.
759 */
760 get_system_configuration();
761
762 /*
763 * Halt if this is an unsupported processor.
764 */
765 if (x86_type == X86_TYPE_486 || x86_type == X86_TYPE_CYRIX_486) {
766 printf("\n486 processor (\"%s\") detected.\n",
767 CPU->cpu_brandstr);
768 halt("This processor is not supported by this release "
769 "of Solaris.");
770 }
771
772 PRM_POINT("startup_init() done");
773 }
774
775 /*
776 * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb, (ie.
777 * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it
778 * also filters out physical page zero. There is some reliance on the
779 * boot loader allocating only a few contiguous physical memory chunks.
780 */
781 static void
782 avail_filter(uint64_t *addr, uint64_t *size)
783 {
784 uintptr_t va;
785 uintptr_t next_va;
786 pfn_t pfn;
787 uint64_t pfn_addr;
788 uint64_t pfn_eaddr;
789 uint_t prot;
790 size_t len;
791 uint_t change;
792
793 if (prom_debug)
794 prom_printf("\tFilter: in: a=%" PRIx64 ", s=%" PRIx64 "\n",
795 *addr, *size);
796
797 /*
798 * page zero is required for BIOS.. never make it available
799 */
800 if (*addr == 0) {
801 *addr += MMU_PAGESIZE;
802 *size -= MMU_PAGESIZE;
803 }
804
805 /*
806 * First we trim from the front of the range. Since kbm_probe()
807 * walks ranges in virtual order, but addr/size are physical, we need
808  * to rescan the list until no changes are seen. This deals with the case
809 * where page "p" is mapped at v, page "p + PAGESIZE" is mapped at w
810 * but w < v.
811 */
812 do {
813 change = 0;
814 for (va = KERNEL_TEXT;
815 *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
816 va = next_va) {
817
818 next_va = va + len;
819 pfn_addr = pfn_to_pa(pfn);
820 pfn_eaddr = pfn_addr + len;
821
822 if (pfn_addr <= *addr && pfn_eaddr > *addr) {
823 change = 1;
824 while (*size > 0 && len > 0) {
825 *addr += MMU_PAGESIZE;
826 *size -= MMU_PAGESIZE;
827 len -= MMU_PAGESIZE;
828 }
829 }
830 }
831 if (change && prom_debug)
832 prom_printf("\t\ttrim: a=%" PRIx64 ", s=%" PRIx64 "\n",
833 *addr, *size);
834 } while (change);
835
836 /*
837 * Trim pages from the end of the range.
838 */
839 for (va = KERNEL_TEXT;
840 *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
841 va = next_va) {
842
843 next_va = va + len;
844 pfn_addr = pfn_to_pa(pfn);
845
846 if (pfn_addr >= *addr && pfn_addr < *addr + *size)
847 *size = pfn_addr - *addr;
848 }
849
850 if (prom_debug)
851 prom_printf("\tFilter out: a=%" PRIx64 ", s=%" PRIx64 "\n",
852 *addr, *size);
853 }
854
855 static void
856 kpm_init()
857 {
858 struct segkpm_crargs b;
859
860 /*
861 * These variables were all designed for sfmmu in which segkpm is
862 * mapped using a single pagesize - either 8KB or 4MB. On x86, we
863 * might use 2+ page sizes on a single machine, so none of these
864 * variables have a single correct value. They are set up as if we
865 * always use a 4KB pagesize, which should do no harm. In the long
866 * run, we should get rid of KPM's assumption that only a single
867 * pagesize is used.
868 */
869 kpm_pgshft = MMU_PAGESHIFT;
870 kpm_pgsz = MMU_PAGESIZE;
871 kpm_pgoff = MMU_PAGEOFFSET;
872 kpmp2pshft = 0;
873 kpmpnpgs = 1;
874 ASSERT(((uintptr_t)kpm_vbase & (kpm_pgsz - 1)) == 0);
875
876 PRM_POINT("about to create segkpm");
877 rw_enter(&kas.a_lock, RW_WRITER);
878
879 if (seg_attach(&kas, kpm_vbase, kpm_size, segkpm) < 0)
880 panic("cannot attach segkpm");
881
882 b.prot = PROT_READ | PROT_WRITE;
883 b.nvcolors = 1;
884
885 if (segkpm_create(segkpm, (caddr_t)&b) != 0)
886 panic("segkpm_create segkpm");
887
888 rw_exit(&kas.a_lock);
889 }
890
891 /*
892 * The debug info page provides enough information to allow external
893 * inspectors (e.g. when running under a hypervisor) to bootstrap
894 * themselves into allowing full-blown kernel debugging.
895 */
896 static void
897 init_debug_info(void)
898 {
899 caddr_t mem;
900 debug_info_t *di;
901
902 #ifndef __lint
903 ASSERT(sizeof (debug_info_t) < MMU_PAGESIZE);
904 #endif
905
906 mem = BOP_ALLOC(bootops, (caddr_t)DEBUG_INFO_VA, MMU_PAGESIZE,
907 MMU_PAGESIZE);
908
909 if (mem != (caddr_t)DEBUG_INFO_VA)
910 panic("BOP_ALLOC() failed");
911 bzero(mem, MMU_PAGESIZE);
912
913 di = (debug_info_t *)mem;
914
915 di->di_magic = DEBUG_INFO_MAGIC;
916 di->di_version = DEBUG_INFO_VERSION;
917 di->di_modules = (uintptr_t)&modules;
918 di->di_s_text = (uintptr_t)s_text;
919 di->di_e_text = (uintptr_t)e_text;
920 di->di_s_data = (uintptr_t)s_data;
921 di->di_e_data = (uintptr_t)e_data;
922 di->di_hat_htable_off = offsetof(hat_t, hat_htable);
923 di->di_ht_pfn_off = offsetof(htable_t, ht_pfn);
924 }
925
926 /*
927 * Build the memlists and other kernel essential memory system data structures.
928 * This is everything at valloc_base.
929 */
930 static void
931 startup_memlist(void)
932 {
933 size_t memlist_sz;
934 size_t memseg_sz;
935 size_t pagehash_sz;
936 size_t pp_sz;
937 uintptr_t va;
938 size_t len;
939 uint_t prot;
940 pfn_t pfn;
941 int memblocks;
942 pfn_t rsvd_high_pfn;
943 pgcnt_t rsvd_pgcnt;
944 size_t rsvdmemlist_sz;
945 int rsvdmemblocks;
946 caddr_t pagecolor_mem;
947 size_t pagecolor_memsz;
948 caddr_t page_ctrs_mem;
949 size_t page_ctrs_size;
950 size_t pse_table_alloc_size;
951 struct memlist *current;
952 extern void startup_build_mem_nodes(struct memlist *);
953
954 /* XX64 fix these - they should be in include files */
955 extern size_t page_coloring_init(uint_t, int, int);
956 extern void page_coloring_setup(caddr_t);
957
958 PRM_POINT("startup_memlist() starting...");
959
960 /*
961 * Use leftover large page nucleus text/data space for loadable modules.
962 * Use at most MODTEXT/MODDATA.
963 */
964 len = kbm_nucleus_size;
965 ASSERT(len > MMU_PAGESIZE);
966
967 moddata = (caddr_t)ROUND_UP_PAGE(e_data);
968 e_moddata = (caddr_t)P2ROUNDUP((uintptr_t)e_data, (uintptr_t)len);
969 if (e_moddata - moddata > MODDATA)
970 e_moddata = moddata + MODDATA;
971
972 modtext = (caddr_t)ROUND_UP_PAGE(e_text);
973 e_modtext = (caddr_t)P2ROUNDUP((uintptr_t)e_text, (uintptr_t)len);
974 if (e_modtext - modtext > MODTEXT)
975 e_modtext = modtext + MODTEXT;
976
977 econtig = e_moddata;
978
979 PRM_DEBUG(modtext);
980 PRM_DEBUG(e_modtext);
981 PRM_DEBUG(moddata);
982 PRM_DEBUG(e_moddata);
983 PRM_DEBUG(econtig);
984
985 /*
986 * Examine the boot loader physical memory map to find out:
987 * - total memory in system - physinstalled
988 * - the max physical address - physmax
989 * - the number of discontiguous segments of memory.
990 */
991 if (prom_debug)
992 print_memlist("boot physinstalled",
993 bootops->boot_mem->physinstalled);
994 installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax,
995 &physinstalled, &memblocks);
996 PRM_DEBUG(physmax);
997 PRM_DEBUG(physinstalled);
998 PRM_DEBUG(memblocks);
999
1000 /*
1001 * Compute maximum physical address for memory DR operations.
1002 * Memory DR operations are unsupported on xpv or 32bit OSes.
1003 */
1004 #ifdef __amd64
1005 if (plat_dr_support_memory()) {
1006 if (plat_dr_physmax == 0) {
1007 uint_t pabits = UINT_MAX;
1008
1009 cpuid_get_addrsize(CPU, &pabits, NULL);
1010 plat_dr_physmax = btop(1ULL << pabits);
1011 }
1012 if (plat_dr_physmax > PHYSMEM_MAX64)
1013 plat_dr_physmax = PHYSMEM_MAX64;
1014 } else
1015 #endif
1016 plat_dr_physmax = 0;
1017
1018 /*
1019 * Examine the bios reserved memory to find out:
1020 * - the number of discontiguous segments of memory.
1021 */
1022 if (prom_debug)
1023 print_memlist("boot reserved mem",
1024 bootops->boot_mem->rsvdmem);
1025 installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn,
1026 &rsvd_pgcnt, &rsvdmemblocks);
1027 PRM_DEBUG(rsvd_high_pfn);
1028 PRM_DEBUG(rsvd_pgcnt);
1029 PRM_DEBUG(rsvdmemblocks);
1030
1031 /*
1032 * Initialize hat's mmu parameters.
1033 * Check for enforce-prot-exec in boot environment. It's used to
1034 * enable/disable support for the page table entry NX bit.
1035 * The default is to enforce PROT_EXEC on processors that support NX.
1036 * Boot seems to round up the "len", but 8 seems to be big enough.
1037 */
1038 mmu_init();
1039
1040 #ifdef __i386
1041 /*
1042 * physmax is lowered if there is more memory than can be
1043 * physically addressed in 32 bit (PAE/non-PAE) modes.
1044 */
1045 if (mmu.pae_hat) {
1046 if (PFN_ABOVE64G(physmax)) {
1047 physinstalled -= (physmax - (PFN_64G - 1));
1048 physmax = PFN_64G - 1;
1049 }
1050 } else {
1051 if (PFN_ABOVE4G(physmax)) {
1052 physinstalled -= (physmax - (PFN_4G - 1));
1053 physmax = PFN_4G - 1;
1054 }
1055 }
1056 #endif
1057
1058 startup_build_mem_nodes(bootops->boot_mem->physinstalled);
1059
1060 if (BOP_GETPROPLEN(bootops, "enforce-prot-exec") >= 0) {
1061 int len = BOP_GETPROPLEN(bootops, "enforce-prot-exec");
1062 char value[8];
1063
1064 if (len < 8)
1065 (void) BOP_GETPROP(bootops, "enforce-prot-exec", value);
1066 else
1067 (void) strcpy(value, "");
1068 if (strcmp(value, "off") == 0)
1069 mmu.pt_nx = 0;
1070 }
1071 PRM_DEBUG(mmu.pt_nx);
1072
1073 /*
1074 * We will need page_t's for every page in the system, except for
1075  * memory mapped at or above the start of the kernel text segment.
1076 *
1077 * pages above e_modtext are attributed to kernel debugger (obp_pages)
1078 */
1079 npages = physinstalled - 1; /* avail_filter() skips page 0, so "- 1" */
1080 obp_pages = 0;
1081 va = KERNEL_TEXT;
1082 while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
1083 npages -= len >> MMU_PAGESHIFT;
1084 if (va >= (uintptr_t)e_moddata)
1085 obp_pages += len >> MMU_PAGESHIFT;
1086 va += len;
1087 }
1088 PRM_DEBUG(npages);
1089 PRM_DEBUG(obp_pages);
1090
1091 /*
1092 * If physmem is patched to be non-zero, use it instead of the computed
1093 * value unless it is larger than the actual amount of memory on hand.
1094 */
1095 if (physmem == 0 || physmem > npages) {
1096 physmem = npages;
1097 } else if (physmem < npages) {
1098 orig_npages = npages;
1099 npages = physmem;
1100 }
1101 PRM_DEBUG(physmem);
1102
1103 /*
1104 * We now compute the sizes of all the initial allocations for
1105 * structures the kernel needs in order do kmem_alloc(). These
1106 * include:
1107 * memsegs
1108 * memlists
1109 * page hash table
1110 * page_t's
1111 * page coloring data structs
1112 */
1113 memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
1114 ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
1115 PRM_DEBUG(memseg_sz);
1116
1117 /*
1118 * Reserve space for memlists. There's no real good way to know exactly
1119 * how much room we'll need, but this should be a good upper bound.
1120 */
1121 memlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1122 (memblocks + POSS_NEW_FRAGMENTS));
1123 ADD_TO_ALLOCATIONS(memlist, memlist_sz);
1124 PRM_DEBUG(memlist_sz);
1125
1126 /*
1127 * Reserve space for bios reserved memlists.
1128 */
1129 rsvdmemlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1130 (rsvdmemblocks + POSS_NEW_FRAGMENTS));
1131 ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
1132 PRM_DEBUG(rsvdmemlist_sz);
1133
1134 /* LINTED */
1135 ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
1136 /*
1137 * The page structure hash table size is a power of 2
1138 * such that the average hash chain length is PAGE_HASHAVELEN.
1139 */
1140 page_hashsz = npages / PAGE_HASHAVELEN;
1141 page_hashsz_shift = highbit(page_hashsz);
1142 page_hashsz = 1 << page_hashsz_shift;
1143 pagehash_sz = sizeof (struct page *) * page_hashsz;
1144 ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
1145 PRM_DEBUG(pagehash_sz);
1146
1147 /*
1148 * Set aside room for the page structures themselves.
1149 */
1150 PRM_DEBUG(npages);
1151 pp_sz = sizeof (struct page) * npages;
1152 ADD_TO_ALLOCATIONS(pp_base, pp_sz);
1153 PRM_DEBUG(pp_sz);
1154
1155 /*
1156 * determine l2 cache info and memory size for page coloring
1157 */
1158 (void) getl2cacheinfo(CPU,
1159 &l2cache_sz, &l2cache_linesz, &l2cache_assoc);
1160 pagecolor_memsz =
1161 page_coloring_init(l2cache_sz, l2cache_linesz, l2cache_assoc);
1162 ADD_TO_ALLOCATIONS(pagecolor_mem, pagecolor_memsz);
1163 PRM_DEBUG(pagecolor_memsz);
1164
1165 page_ctrs_size = page_ctrs_sz();
1166 ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
1167 PRM_DEBUG(page_ctrs_size);
1168
1169 /*
1170 * Allocate the array that protects pp->p_selock.
1171 */
1172 pse_shift = size_pse_array(physmem, max_ncpus);
1173 pse_table_size = 1 << pse_shift;
1174 pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
1175 ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
1176
1177 #if defined(__amd64)
1178 valloc_sz = ROUND_UP_LPAGE(valloc_sz);
1179 valloc_base = VALLOC_BASE;
1180
1181 /*
1182 * The default values of VALLOC_BASE and SEGKPM_BASE should work
1183 * for values of physmax up to 1 Terabyte. They need adjusting when
1184 * memory is at addresses above 1 TB. When adjusted, segkpm_base must
1185 * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
1186 */
1187 if (physmax + 1 > mmu_btop(TERABYTE) ||
1188 plat_dr_physmax > mmu_btop(TERABYTE)) {
1189 uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
1190
1191 if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
1192 kpm_resv_amount = mmu_ptob(plat_dr_physmax);
1193 }
1194
1195 segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
1196 KERNEL_REDZONE_SIZE)); /* down from top VA */
1197
1198 /* make sure we leave some space for user apps above hole */
1199 segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
1200 if (segkpm_base > SEGKPM_BASE)
1201 segkpm_base = SEGKPM_BASE;
1202 PRM_DEBUG(segkpm_base);
1203
1204 valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG);
1205 if (valloc_base < segkpm_base)
1206 panic("not enough kernel VA to support memory size");
1207 PRM_DEBUG(valloc_base);
1208 }
1209 #else /* __i386 */
1210 valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz);
1211 valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]);
1212 PRM_DEBUG(valloc_base);
1213 #endif /* __i386 */
1214
1215 /*
1216 * do all the initial allocations
1217 */
1218 perform_allocations();
1219
1220 /*
1221 * Build phys_install and phys_avail in kernel memspace.
1222 * - phys_install should be all memory in the system.
1223 * - phys_avail is phys_install minus any memory mapped before this
1224 * point above KERNEL_TEXT.
1225 */
1226 current = phys_install = memlist;
1227 copy_memlist_filter(bootops->boot_mem->physinstalled, &current, NULL);
1228 if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1229 panic("physinstalled was too big!");
1230 if (prom_debug)
1231 print_memlist("phys_install", phys_install);
1232
1233 phys_avail = current;
1234 PRM_POINT("Building phys_avail:\n");
1235 copy_memlist_filter(bootops->boot_mem->physinstalled, &current,
1236 avail_filter);
1237 if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1238 panic("physavail was too big!");
1239 if (prom_debug)
1240 print_memlist("phys_avail", phys_avail);
1241 #ifndef __xpv
1242 /*
1243 * Free unused memlist items, which may be used by memory DR driver
1244 * at runtime.
1245 */
1246 if ((caddr_t)current < (caddr_t)memlist + memlist_sz) {
1247 memlist_free_block((caddr_t)current,
1248 (caddr_t)memlist + memlist_sz - (caddr_t)current);
1249 }
1250 #endif
1251
1252 /*
1253 * Build bios reserved memspace
1254 */
1255 current = bios_rsvd;
1256 copy_memlist_filter(bootops->boot_mem->rsvdmem, &current, NULL);
1257 if ((caddr_t)current > (caddr_t)bios_rsvd + rsvdmemlist_sz)
1258 panic("bios_rsvd was too big!");
1259 if (prom_debug)
1260 print_memlist("bios_rsvd", bios_rsvd);
1261 #ifndef __xpv
1262 /*
1263 * Free unused memlist items, which may be used by memory DR driver
1264 * at runtime.
1265 */
1266 if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) {
1267 memlist_free_block((caddr_t)current,
1268 (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current);
1269 }
1270 #endif
1271
1272 /*
1273 * setup page coloring
1274 */
1275 page_coloring_setup(pagecolor_mem);
1276 page_lock_init(); /* currently a no-op */
1277
1278 /*
1279 * free page list counters
1280 */
1281 (void) page_ctrs_alloc(page_ctrs_mem);
1282
1283 /*
1284 * Size the pcf array based on the number of cpus in the box at
1285 * boot time.
1286 */
1287
1288 pcf_init();
1289
1290 /*
1291 * Initialize the page structures from the memory lists.
1292 */
1293 availrmem_initial = availrmem = freemem = 0;
1294 PRM_POINT("Calling kphysm_init()...");
1295 npages = kphysm_init(pp_base, npages);
1296 PRM_POINT("kphysm_init() done");
1297 PRM_DEBUG(npages);
1298
1299 init_debug_info();
1300
1301 /*
1302 * Now that page_t's have been initialized, remove all the
1303 * initial allocation pages from the kernel free page lists.
1304 */
1305 boot_mapin((caddr_t)valloc_base, valloc_sz);
1306 boot_mapin((caddr_t)MISC_VA_BASE, MISC_VA_SIZE);
1307 PRM_POINT("startup_memlist() done");
1308
1309 PRM_DEBUG(valloc_sz);
1310
1311 #if defined(__amd64)
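/*
 * (availrmem >> (30 - MMU_PAGESHIFT)) converts pages to gigabytes. On
 * machines with at least textrepl_min_gb GB of available memory and an L2
 * cache of 2MB or less, set the kernel text replication size threshold to
 * just under 16MB.
 */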
1312 if ((availrmem >> (30 - MMU_PAGESHIFT)) >=
1313 textrepl_min_gb && l2cache_sz <= 2 << 20) {
1314 extern size_t textrepl_size_thresh;
1315 textrepl_size_thresh = (16 << 20) - 1;
1316 }
1317 #endif
1318 }
1319
1320 /*
1321 * Layout the kernel's part of address space and initialize kmem allocator.
1322 */
1323 static void
1324 startup_kmem(void)
1325 {
1326 extern void page_set_colorequiv_arr(void);
1327
1328 PRM_POINT("startup_kmem() starting...");
1329
1330 #if defined(__amd64)
1331 if (eprom_kernelbase && eprom_kernelbase != KERNELBASE)
1332 cmn_err(CE_NOTE, "!kernelbase cannot be changed on 64-bit "
1333 "systems.");
1334 kernelbase = segkpm_base - KERNEL_REDZONE_SIZE;
1335 core_base = (uintptr_t)COREHEAP_BASE;
1336 core_size = (size_t)MISC_VA_BASE - COREHEAP_BASE;
1337 #else /* __i386 */
1338 /*
1339 * We configure kernelbase based on:
1340 *
1341  * 1. A user-specified kernelbase set via the eeprom command. The value
1342  * cannot exceed KERNELBASE_MAX; eprom_kernelbase is aligned to a large page.
1343  *
1344  * 2. Default to KERNELBASE, lowered by twice the space needed for page_t's.
1345  * On large memory systems we must lower kernelbase to allow
1346  * enough room for page_t's for all of memory.
1347  *
1348  * The value set here might be changed a little later.
1349 */
1350 if (eprom_kernelbase) {
1351 kernelbase = eprom_kernelbase & mmu.level_mask[1];
1352 if (kernelbase > KERNELBASE_MAX)
1353 kernelbase = KERNELBASE_MAX;
1354 } else {
1355 kernelbase = (uintptr_t)KERNELBASE;
1356 kernelbase -= ROUND_UP_4MEG(2 * valloc_sz);
1357 }
1358 ASSERT((kernelbase & mmu.level_offset[1]) == 0);
1359 core_base = valloc_base;
1360 core_size = 0;
1361 #endif /* __i386 */
1362
1363 PRM_DEBUG(core_base);
1364 PRM_DEBUG(core_size);
1365 PRM_DEBUG(kernelbase);
1366
1367 #if defined(__i386)
1368 segkp_fromheap = 1;
1369 #endif /* __i386 */
1370
1371 ekernelheap = (char *)core_base;
1372 PRM_DEBUG(ekernelheap);
1373
1374 /*
1375 * Now that we know the real value of kernelbase,
1376 * update variables that were initialized with a value of
1377 * KERNELBASE (in common/conf/param.c).
1378 *
1379 * XXX The problem with this sort of hackery is that the
1380 * compiler just may feel like putting the const declarations
1381 * (in param.c) into the .text section. Perhaps they should
1382 * just be declared as variables there?
1383 */
1384
1385 *(uintptr_t *)&_kernelbase = kernelbase;
1386 *(uintptr_t *)&_userlimit = kernelbase;
1387 #if defined(__amd64)
1388 *(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT;
1389 #else
1390 *(uintptr_t *)&_userlimit32 = _userlimit;
1391 #endif
1392 PRM_DEBUG(_kernelbase);
1393 PRM_DEBUG(_userlimit);
1394 PRM_DEBUG(_userlimit32);
1395
1396 layout_kernel_va();
1397
1398 #if defined(__i386)
1399 /*
1400 * If segmap is too large we can push the bottom of the kernel heap
1401 * higher than the base. Or worse, it could exceed the top of the
1402 * VA space entirely, causing it to wrap around.
1403 */
1404 if (kernelheap >= ekernelheap || (uintptr_t)kernelheap < kernelbase)
1405 panic("too little address space available for kernelheap,"
1406 " use eeprom for lower kernelbase or smaller segmapsize");
1407 #endif /* __i386 */
1408
1409 /*
1410 * Initialize the kernel heap. Note 3rd argument must be > 1st.
1411 */
1412 kernelheap_init(kernelheap, ekernelheap,
1413 kernelheap + MMU_PAGESIZE,
1414 (void *)core_base, (void *)(core_base + core_size));
1415
1416 #if defined(__xpv)
1417 /*
1418 * Link pending events struct into cpu struct
1419 */
1420 CPU->cpu_m.mcpu_evt_pend = &cpu0_evt_data;
1421 #endif
1422 /*
1423 * Initialize kernel memory allocator.
1424 */
1425 kmem_init();
1426
1427 /*
1428 * Factor in colorequiv to check additional 'equivalent' bins
1429 */
1430 page_set_colorequiv_arr();
1431
1432 /*
1433 * print this out early so that we know what's going on
1434 */
1435 print_x86_featureset(x86_featureset);
1436
1437 /*
1438 * Initialize bp_mapin().
1439 */
1440 bp_init(MMU_PAGESIZE, HAT_STORECACHING_OK);
1441
1442 /*
1443 * orig_npages is non-zero if physmem has been configured for less
1444 * than the available memory.
1445 */
1446 if (orig_npages) {
1447 cmn_err(CE_WARN, "!%slimiting physmem to 0x%lx of 0x%lx pages",
1448 (npages == PHYSMEM ? "Due to virtual address space " : ""),
1449 npages, orig_npages);
1450 }
1451 #if defined(__i386)
1452 if (eprom_kernelbase && (eprom_kernelbase != kernelbase))
1453 cmn_err(CE_WARN, "kernelbase value, User specified 0x%lx, "
1454 "System using 0x%lx",
1455 (uintptr_t)eprom_kernelbase, (uintptr_t)kernelbase);
1456 #endif
1457
1458 #ifdef KERNELBASE_ABI_MIN
1459 if (kernelbase < (uintptr_t)KERNELBASE_ABI_MIN) {
1460 cmn_err(CE_NOTE, "!kernelbase set to 0x%lx, system is not "
1461 "i386 ABI compliant.", (uintptr_t)kernelbase);
1462 }
1463 #endif
1464
1465 #ifndef __xpv
1466 if (plat_dr_support_memory()) {
1467 mem_config_init();
1468 }
1469 #else /* __xpv */
1470 /*
1471 * Some of the xen start information has to be relocated up
1472 * into the kernel's permanent address space.
1473 */
1474 PRM_POINT("calling xen_relocate_start_info()");
1475 xen_relocate_start_info();
1476 PRM_POINT("xen_relocate_start_info() done");
1477
1478 /*
1479 * (Update the vcpu pointer in our cpu structure to point into
1480 * the relocated shared info.)
1481 */
1482 CPU->cpu_m.mcpu_vcpu_info =
1483 &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id];
1484 #endif /* __xpv */
1485
1486 PRM_POINT("startup_kmem() done");
1487 }
1488
1489 #ifndef __xpv
1490 /*
1491 * If we have detected that we are running in an HVM environment, we need
1492 * to prepend the PV driver directory to the module search path.
1493 */
1494 #define HVM_MOD_DIR "/platform/i86hvm/kernel"
1495 static void
1496 update_default_path()
1497 {
1498 char *current, *newpath;
1499 int newlen;
1500
1501 /*
1502 * We are about to resync with krtld. krtld will reset its
1503 * internal module search path iff Solaris has set default_path.
1504 * We want to be sure we're prepending this new directory to the
1505 * right search path.
1506 */
1507 current = (default_path == NULL) ? kobj_module_path : default_path;
1508
1509 newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
1510 newpath = kmem_alloc(newlen, KM_SLEEP);
1511 (void) strcpy(newpath, HVM_MOD_DIR);
1512 (void) strcat(newpath, " ");
1513 (void) strcat(newpath, current);
1514
1515 default_path = newpath;
1516 }
1517 #endif
1518
1519 static void
1520 startup_modules(void)
1521 {
1522 int cnt;
1523 extern void prom_setup(void);
1524 int32_t v, h;
1525 char d[11];
1526 char *cp;
1527 cmi_hdl_t hdl;
1528
1529 PRM_POINT("startup_modules() starting...");
1530
1531 #ifndef __xpv
1532 /*
1533  * Initialize the ten-microsecond timer so that drivers will
1534  * not get shortchanged in their init phase. This was
1535  * not getting called until clkinit, which on fast CPUs
1536  * caused drv_usecwait() to be way too short.
1537 */
1538 microfind();
1539
1540 if ((get_hwenv() & HW_XEN_HVM) != 0)
1541 update_default_path();
1542 #endif
1543
1544 /*
1545 * Read the GMT lag from /etc/rtc_config.
1546 */
1547 sgmtl(process_rtc_config_file());
1548
1549 /*
1550 * Calculate default settings of system parameters based upon
1551 * maxusers, yet allow to be overridden via the /etc/system file.
1552 */
1553 param_calc(0);
1554
1555 mod_setup();
1556
1557 /*
1558 * Initialize system parameters.
1559 */
1560 param_init();
1561
1562 /*
1563 * Initialize the default brands
1564 */
1565 brand_init();
1566
1567 /*
1568 * maxmem is the amount of physical memory we're playing with.
1569 */
1570 maxmem = physmem;
1571
1572 /*
1573 * Initialize segment management stuff.
1574 */
1575 seg_init();
1576
1577 if (modload("fs", "specfs") == -1)
1578 halt("Can't load specfs");
1579
1580 if (modload("fs", "devfs") == -1)
1581 halt("Can't load devfs");
1582
1583 if (modload("fs", "dev") == -1)
1584 halt("Can't load dev");
1585
1586 if (modload("fs", "procfs") == -1)
1587 halt("Can't load procfs");
1588
1589 (void) modloadonly("sys", "lbl_edition");
1590
1591 dispinit();
1592
1593 /* Read cluster configuration data. */
1594 clconf_init();
1595
1596 #if defined(__xpv)
1597 (void) ec_init();
1598 gnttab_init();
1599 (void) xs_early_init();
1600 #endif /* __xpv */
1601
1602 /*
1603 * Create a kernel device tree. First, create rootnex and
1604 * then invoke bus specific code to probe devices.
1605 */
1606 setup_ddi();
1607
1608 #ifdef __xpv
1609 if (DOMAIN_IS_INITDOMAIN(xen_info))
1610 #endif
1611 {
1612 /*
1613 * Load the System Management BIOS into the global ksmbios
1614 * handle, if an SMBIOS is present on this system.
1615 */
1616 ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
1617 }
1618
1619
1620 /*
1621 * Originally clconf_init() apparently needed the hostid. But
1622 * this no longer appears to be true - it uses its own nodeid.
1623 * By placing the hostid logic here, we are able to make use of
1624 * the SMBIOS UUID.
1625 */
1626 if ((h = set_soft_hostid()) == HW_INVALID_HOSTID) {
1627 cmn_err(CE_WARN, "Unable to set hostid");
1628 } else {
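/*
 * Decompose the hostid into decimal digits, least significant first, then
 * copy them back out most significant first so that hw_serial ends up
 * holding the hostid as a decimal ASCII string.
 */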
1629 for (v = h, cnt = 0; cnt < 10; cnt++) {
1630 d[cnt] = (char)(v % 10);
1631 v /= 10;
1632 if (v == 0)
1633 break;
1634 }
1635 for (cp = hw_serial; cnt >= 0; cnt--)
1636 *cp++ = d[cnt] + '0';
1637 *cp = 0;
1638 }
1639
1640 /*
1641 * Set up the CPU module subsystem for the boot cpu in the native
1642 * case, and all physical cpu resource in the xpv dom0 case.
1643 * Modifies the device tree, so this must be done after
1644 * setup_ddi().
1645 */
1646 #ifdef __xpv
1647 /*
1648 * If paravirtualized and on dom0 then we initialize all physical
1649 * cpu handles now; if paravirtualized on a domU then do not
1650 * initialize.
1651 */
1652 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1653 xen_mc_lcpu_cookie_t cpi;
1654
1655 for (cpi = xen_physcpu_next(NULL); cpi != NULL;
1656 cpi = xen_physcpu_next(cpi)) {
1657 if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA,
1658 xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi),
1659 xen_physcpu_strandid(cpi))) != NULL &&
1660 is_x86_feature(x86_featureset, X86FSET_MCA))
1661 cmi_mca_init(hdl);
1662 }
1663 }
1664 #else
1665 /*
1666 * Initialize a handle for the boot cpu - others will initialize
1667 * as they startup. Do not do this if we know we are in an HVM domU.
1668 */
1669 if ((get_hwenv() & HW_XEN_HVM) == 0 &&
1670 (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU),
1671 cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL &&
1672 is_x86_feature(x86_featureset, X86FSET_MCA)) {
1673 cmi_mca_init(hdl);
1674 CPU->cpu_m.mcpu_cmi_hdl = hdl;
1675 }
1676 #endif /* __xpv */
1677
1678 /*
1679 * Fake a prom tree such that /dev/openprom continues to work
1680 */
1681 PRM_POINT("startup_modules: calling prom_setup...");
1682 prom_setup();
1683 PRM_POINT("startup_modules: done");
1684
1685 /*
1686 * Load all platform specific modules
1687 */
1688 PRM_POINT("startup_modules: calling psm_modload...");
1689 psm_modload();
1690
1691 PRM_POINT("startup_modules() done");
1692 }
1693
1694 /*
1695 * claim a "setaside" boot page for use in the kernel
1696 */
1697 page_t *
1698 boot_claim_page(pfn_t pfn)
1699 {
1700 page_t *pp;
1701
1702 pp = page_numtopp_nolock(pfn);
1703 ASSERT(pp != NULL);
1704
1705 if (PP_ISBOOTPAGES(pp)) {
1706 if (pp->p_next != NULL)
1707 pp->p_next->p_prev = pp->p_prev;
1708 if (pp->p_prev == NULL)
1709 bootpages = pp->p_next;
1710 else
1711 pp->p_prev->p_next = pp->p_next;
1712 } else {
1713 /*
1714 * htable_attach() expects a base pagesize page
1715 */
1716 if (pp->p_szc != 0)
1717 page_boot_demote(pp);
1718 pp = page_numtopp(pfn, SE_EXCL);
1719 }
1720 return (pp);
1721 }
1722
1723 /*
1724 * Walk through the pagetables looking for pages mapped in by boot. If the
1725 * setaside flag is set the pages are expected to be returned to the
1726 * kernel later in boot, so we add them to the bootpages list.
1727 */
1728 static void
1729 protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
1730 {
1731 uintptr_t va = low;
1732 size_t len;
1733 uint_t prot;
1734 pfn_t pfn;
1735 page_t *pp;
1736 pgcnt_t boot_protect_cnt = 0;
1737
1738 while (kbm_probe(&va, &len, &pfn, &prot) != 0 && va < high) {
1739 if (va + len >= high)
1740 panic("0x%lx byte mapping at 0x%p exceeds boot's "
1741 "legal range.", len, (void *)va);
1742
1743 while (len > 0) {
1744 pp = page_numtopp_alloc(pfn);
1745 if (pp != NULL) {
1746 if (setaside == 0)
1747 panic("Unexpected mapping by boot. "
1748 "addr=%p pfn=%lx\n",
1749 (void *)va, pfn);
1750
1751 pp->p_next = bootpages;
1752 pp->p_prev = NULL;
1753 PP_SETBOOTPAGES(pp);
1754 if (bootpages != NULL) {
1755 bootpages->p_prev = pp;
1756 }
1757 bootpages = pp;
1758 ++boot_protect_cnt;
1759 }
1760
1761 ++pfn;
1762 len -= MMU_PAGESIZE;
1763 va += MMU_PAGESIZE;
1764 }
1765 }
1766 PRM_DEBUG(boot_protect_cnt);
1767 }
1768
1769 /*
1770 *
1771 */
1772 static void
1773 layout_kernel_va(void)
1774 {
1775 PRM_POINT("layout_kernel_va() starting...");
1776 /*
1777 * Establish the final size of the kernel's heap, size of segmap,
1778 * segkp, etc.
1779 */
1780
1781 #if defined(__amd64)
1782
1783 kpm_vbase = (caddr_t)segkpm_base;
1784 if (physmax + 1 < plat_dr_physmax) {
1785 kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax));
1786 } else {
1787 kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
1788 }
1789 if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
1790 panic("not enough room for kpm!");
1791 PRM_DEBUG(kpm_size);
1792 PRM_DEBUG(kpm_vbase);
1793
1794 /*
1795 * By default we create a seg_kp in 64 bit kernels, it's a little
1796 * faster to access than embedding it in the heap.
1797 */
1798 segkp_base = (caddr_t)valloc_base + valloc_sz;
1799 if (!segkp_fromheap) {
1800 size_t sz = mmu_ptob(segkpsize);
1801
1802 /*
1803 * determine size of segkp
1804 */
1805 if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) {
1806 sz = SEGKPDEFSIZE;
1807 cmn_err(CE_WARN, "!Illegal value for segkpsize. "
1808 "segkpsize has been reset to %ld pages",
1809 mmu_btop(sz));
1810 }
1811 sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem)));
1812
1813 segkpsize = mmu_btop(ROUND_UP_LPAGE(sz));
1814 }
1815 PRM_DEBUG(segkp_base);
1816 PRM_DEBUG(segkpsize);
1817
1818 /*
1819 * segzio is used for ZFS cached data. It uses a distinct VA
1820 * segment (from kernel heap) so that we can easily tell not to
1821 * include it in kernel crash dumps on 64 bit kernels. The trick is
1822 * to give it lots of VA, but not constrain the kernel heap.
1823 * We scale the size of segzio linearly with physmem up to
1824 * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
1825 */
1826 segzio_base = segkp_base + SEGKPMAXSIZE;
1827 if (segzio_fromheap) {
1828 segziosize = 0;
1829 } else {
1830 size_t physmem_size = mmu_ptob(physmem);
1831 size_t size = (segziosize == 0) ?
1832 physmem_size : mmu_ptob(segziosize);
1833
1834 if (size < SEGZIOMINSIZE)
1835 size = SEGZIOMINSIZE;
1836 if (size > SEGZIOMAXSIZE) {
1837 size = SEGZIOMAXSIZE;
1838 if (physmem_size > size)
1839 size += (physmem_size - size) / 2;
1840 }
1841 segziosize = mmu_btop(ROUND_UP_LPAGE(size));
1842 }
1843 PRM_DEBUG(segziosize);
1844 PRM_DEBUG(segzio_base);
1845
1846 /*
1847  * Put the range of VA for device mappings next; kmdb knows to not
1848  * grep in this range of addresses.
1849 */
1850 toxic_addr =
1851 ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
1852 PRM_DEBUG(toxic_addr);
1853 segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
1854 #else /* __i386 */
1855 segmap_start = ROUND_UP_LPAGE(kernelbase);
1856 #endif /* __i386 */
1857 PRM_DEBUG(segmap_start);
1858
1859 /*
1860 * Users can change segmapsize through eeprom. If the variable
1861 * is tuned through eeprom, there is no upper bound on the
1862 * size of segmap.
1863 */
1864 segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);
1865
1866 #if defined(__i386)
1867 /*
1868 * 32-bit systems don't have segkpm or segkp, so segmap appears at
1869 * the bottom of the kernel's address range. Set aside space for a
1870 * small red zone just below the start of segmap.
1871 */
1872 segmap_start += KERNEL_REDZONE_SIZE;
1873 segmapsize -= KERNEL_REDZONE_SIZE;
1874 #endif
1875
1876 PRM_DEBUG(segmap_start);
1877 PRM_DEBUG(segmapsize);
1878 kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
1879 PRM_DEBUG(kernelheap);
1880 PRM_POINT("layout_kernel_va() done...");
1881 }
1882
1883 /*
1884 * Finish initializing the VM system, now that we are no longer
1885 * relying on the boot time memory allocators.
1886 */
1887 static void
1888 startup_vm(void)
1889 {
1890 struct segmap_crargs a;
1891
1892 extern int use_brk_lpg, use_stk_lpg;
1893
1894 PRM_POINT("startup_vm() starting...");
1895
1896 /*
1897 * Initialize the hat layer.
1898 */
1899 hat_init();
1900
1901 /*
1902 * Do final allocations of HAT data structures that need to
1903 * be allocated before quiescing the boot loader.
1904 */
1905 PRM_POINT("Calling hat_kern_alloc()...");
1906 hat_kern_alloc((caddr_t)segmap_start, segmapsize, ekernelheap);
1907 PRM_POINT("hat_kern_alloc() done");
1908
1909 #ifndef __xpv
1910 /*
1911 * Setup Page Attribute Table
1912 */
1913 pat_sync();
1914 #endif
1915
1916 /*
1917 * The next two loops are done in distinct steps in order
1918 * to be sure that any page that is doubly mapped (both above
1919 * KERNEL_TEXT and below kernelbase) is dealt with correctly.
1920 * Note this may never happen, but it might someday.
1921 */
1922 bootpages = NULL;
1923 PRM_POINT("Protecting boot pages");
1924
1925 /*
1926 * Protect any pages mapped above KERNEL_TEXT that somehow have
1927 * page_t's. This can only happen if something weird allocated
1928 * in this range (like kadb/kmdb).
1929 */
1930 protect_boot_range(KERNEL_TEXT, (uintptr_t)-1, 0);
1931
1932 /*
1933 * Before we can take over memory allocation/mapping from the boot
1934 * loader we must remove from our free page lists any boot allocated
1935 * pages that stay mapped until release_bootstrap().
1936 */
1937 protect_boot_range(0, kernelbase, 1);
1938
1939
1940 /*
1941 * Switch to running on regular HAT (not boot_mmu)
1942 */
1943 PRM_POINT("Calling hat_kern_setup()...");
1944 hat_kern_setup();
1945
1946 /*
1947 * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
1948 */
1949 bop_no_more_mem();
1950
1951 PRM_POINT("hat_kern_setup() done");
1952
1953 hat_cpu_online(CPU);
1954
1955 /*
1956 * Initialize VM system
1957 */
1958 PRM_POINT("Calling kvm_init()...");
1959 kvm_init();
1960 PRM_POINT("kvm_init() done");
1961
1962 /*
1963 * Tell kmdb that the VM system is now working
1964 */
1965 if (boothowto & RB_DEBUG)
1966 kdi_dvec_vmready();
1967
1968 #if defined(__xpv)
1969 /*
1970 * Populate the I/O pool on domain 0
1971 */
1972 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1973 extern long populate_io_pool(void);
1974 long init_io_pool_cnt;
1975
1976 PRM_POINT("Populating reserve I/O page pool");
1977 init_io_pool_cnt = populate_io_pool();
1978 PRM_DEBUG(init_io_pool_cnt);
1979 }
1980 #endif
1981 /*
1982 * Mangle the brand string etc.
1983 */
1984 cpuid_pass3(CPU);
1985
1986 #if defined(__amd64)
1987
1988 /*
1989 * Create the device arena for toxic (to dtrace/kmdb) mappings.
1990 */
1991 device_arena = vmem_create("device", (void *)toxic_addr,
1992 toxic_size, MMU_PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
1993
1994 #else /* __i386 */
1995
1996 /*
1997 * allocate the bit map that tracks toxic pages
1998 */
1999 toxic_bit_map_len = btop((ulong_t)(valloc_base - kernelbase));
2000 PRM_DEBUG(toxic_bit_map_len);
2001 toxic_bit_map =
2002 kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len), KM_NOSLEEP);
2003 ASSERT(toxic_bit_map != NULL);
2004 PRM_DEBUG(toxic_bit_map);
2005
2006 #endif /* __i386 */
2007
2008
2009 /*
2010 * Now that we've got more VA, as well as the ability to allocate from
2011 * it, tell the debugger.
2012 */
2013 if (boothowto & RB_DEBUG)
2014 kdi_dvec_memavail();
2015
2016 /*
2017 * The following code installs a special page fault handler (#pf)
2018 * to work around a pentium bug.
2019 */
2020 #if !defined(__amd64) && !defined(__xpv)
2021 if (x86_type == X86_TYPE_P5) {
2022 desctbr_t idtr;
2023 gate_desc_t *newidt;
2024
2025 if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL)
2026 panic("failed to install pentium_pftrap");
2027
2028 bcopy(idt0, newidt, NIDT * sizeof (*idt0));
2029 set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap,
2030 KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
2031
2032 (void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE,
2033 PROT_READ | PROT_EXEC);
2034
2035 CPU->cpu_idt = newidt;
2036 idtr.dtr_base = (uintptr_t)CPU->cpu_idt;
2037 idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
2038 wr_idtr(&idtr);
2039 }
2040 #endif /* !__amd64 */
2041
2042 #if !defined(__xpv)
2043 /*
2044 * Map page pfn=0 for drivers, such as kd, that need to pick up
2045 * parameters left there by controllers/BIOS.
2046 */
2047 PRM_POINT("setup up p0_va");
2048 p0_va = i86devmap(0, 1, PROT_READ);
2049 PRM_DEBUG(p0_va);
2050 #endif
2051
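/*
 * Report installed memory; shifting the page count left by
 * (MMU_PAGESHIFT - 10) converts pages to kilobytes.
 */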
2052 cmn_err(CE_CONT, "?mem = %luK (0x%lx)\n",
2053 physinstalled << (MMU_PAGESHIFT - 10), ptob(physinstalled));
2054
2055 /*
2056 * disable automatic large pages for small memory systems or
2057 * when the disable flag is set.
2058 *
2059 * Do not yet consider page sizes larger than 2m/4m.
2060 */
2061 if (!auto_lpg_disable && mmu.max_page_level > 0) {
2062 max_uheap_lpsize = LEVEL_SIZE(1);
2063 max_ustack_lpsize = LEVEL_SIZE(1);
2064 max_privmap_lpsize = LEVEL_SIZE(1);
2065 max_uidata_lpsize = LEVEL_SIZE(1);
2066 max_utext_lpsize = LEVEL_SIZE(1);
2067 max_shm_lpsize = LEVEL_SIZE(1);
2068 }
2069 if (physmem < privm_lpg_min_physmem || mmu.max_page_level == 0 ||
2070 auto_lpg_disable) {
2071 use_brk_lpg = 0;
2072 use_stk_lpg = 0;
2073 }
2074 mcntl0_lpsize = LEVEL_SIZE(mmu.umax_page_level);
2075
2076 PRM_POINT("Calling hat_init_finish()...");
2077 hat_init_finish();
2078 PRM_POINT("hat_init_finish() done");
2079
2080 /*
2081 * Initialize the segkp segment type.
2082 */
2083 rw_enter(&kas.a_lock, RW_WRITER);
2084 PRM_POINT("Attaching segkp");
2085 if (segkp_fromheap) {
2086 segkp->s_as = &kas;
2087 } else if (seg_attach(&kas, (caddr_t)segkp_base, mmu_ptob(segkpsize),
2088 segkp) < 0) {
2089 panic("startup: cannot attach segkp");
2090 /*NOTREACHED*/
2091 }
2092 PRM_POINT("Doing segkp_create()");
2093 if (segkp_create(segkp) != 0) {
2094 panic("startup: segkp_create failed");
2095 /*NOTREACHED*/
2096 }
2097 PRM_DEBUG(segkp);
2098 rw_exit(&kas.a_lock);
2099
2100 /*
2101 * kpm segment
2102 */
2103 segmap_kpm = 0;
2104 if (kpm_desired) {
2105 kpm_init();
2106 kpm_enable = 1;
2107 }
2108
2109 /*
2110 * Now create segmap segment.
2111 */
2112 rw_enter(&kas.a_lock, RW_WRITER);
2113 if (seg_attach(&kas, (caddr_t)segmap_start, segmapsize, segmap) < 0) {
2114 panic("cannot attach segmap");
2115 /*NOTREACHED*/
2116 }
2117 PRM_DEBUG(segmap);
2118
2119 a.prot = PROT_READ | PROT_WRITE;
2120 a.shmsize = 0;
2121 a.nfreelist = segmapfreelists;
2122
2123 if (segmap_create(segmap, (caddr_t)&a) != 0)
2124 panic("segmap_create segmap");
2125 rw_exit(&kas.a_lock);
2126
2127 setup_vaddr_for_ppcopy(CPU);
2128
2129 segdev_init();
2130 #if defined(__xpv)
2131 if (DOMAIN_IS_INITDOMAIN(xen_info))
2132 #endif
2133 pmem_init();
2134
2135 PRM_POINT("startup_vm() done");
2136 }
2137
2138 /*
2139 * Load a tod module for the non-standard tod part found on this system.
2140 */
2141 static void
2142 load_tod_module(char *todmod)
2143 {
2144 if (modload("tod", todmod) == -1)
2145 halt("Can't load TOD module");
2146 }
2147
2148 static void
2149 startup_end(void)
2150 {
2151 int i;
2152 extern void setx86isalist(void);
2153 extern void cpu_event_init(void);
2154
2155 PRM_POINT("startup_end() starting...");
2156
2157 /*
2158 * Perform tasks that get done after most of the VM
2159 * initialization has been done but before the clock
2160 * and other devices get started.
2161 */
2162 kern_setup1();
2163
2164 /*
2165 * Perform CPC initialization for this CPU.
2166 */
2167 kcpc_hw_init(CPU);
2168
2169 /*
2170 * Initialize cpu event framework.
2171 */
2172 cpu_event_init();
2173
2174 #if defined(OPTERON_WORKAROUND_6323525)
2175 if (opteron_workaround_6323525)
2176 patch_workaround_6323525();
2177 #endif
2178 /*
2179 * If needed, load TOD module now so that ddi_get_time(9F) etc. work
2180 * (For now, "needed" is defined as set tod_module_name in /etc/system)
2181 */
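/*
 * Illustrative /etc/system entry (the module name below is a placeholder,
 * not a statement about which TOD drivers exist on a given platform):
 *
 *	set tod_module_name = "todexample"
 */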
2182 if (tod_module_name != NULL) {
2183 PRM_POINT("load_tod_module()");
2184 load_tod_module(tod_module_name);
2185 }
2186
2187 #if defined(__xpv)
2188 /*
2189 * Forceload interposing TOD module for the hypervisor.
2190 */
2191 PRM_POINT("load_tod_module()");
2192 load_tod_module("xpvtod");
2193 #endif
2194
2195 /*
2196 * Configure the system.
2197 */
2198 PRM_POINT("Calling configure()...");
2199 configure(); /* set up devices */
2200 PRM_POINT("configure() done");
2201
2202 /*
2203 * We can now setup for XSAVE because fpu_probe is done in configure().
2204 */
2205 if (fp_save_mech == FP_XSAVE) {
2206 xsave_setup_msr(CPU);
2207 }
2208
2209 /*
2210 * Set the isa_list string to the defined instruction sets we
2211 * support.
2212 */
2213 setx86isalist();
2214 cpu_intr_alloc(CPU, NINTR_THREADS);
2215 psm_install();
2216
2217 /*
2218 * We're done with bootops. We don't unmap the bootstrap yet because
2219 * we're still using bootsvcs.
2220 */
2221 PRM_POINT("NULLing out bootops");
2222 *bootopsp = (struct bootops *)NULL;
2223 bootops = (struct bootops *)NULL;
2224
2225 #if defined(__xpv)
2226 ec_init_debug_irq();
2227 xs_domu_init();
2228 #endif
2229
2230 #if defined(__amd64) && !defined(__xpv)
2231 /*
2232 * Intel IOMMU has been setup/initialized in ddi_impl.c
2233 * Start it up now.
2234 */
2235 immu_startup();
2236 #endif
2237
2238 PRM_POINT("Enabling interrupts");
2239 (*picinitf)();
2240 sti();
2241 #if defined(__xpv)
2242 ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0);
2243 xen_late_startup();
2244 #endif
2245
2246 (void) add_avsoftintr((void *)&softlevel1_hdl, 1, softlevel1,
2247 "softlevel1", NULL, NULL); /* XXX to be moved later */
2248
2249 /*
2250 * Register software interrupt handlers for ddi_periodic_add(9F).
2251 * Software interrupts up to the level 10 are supported.
2252 */
2253 for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
2254 (void) add_avsoftintr((void *)&softlevel_hdl[i-1], i,
2255 (avfunc)ddi_periodic_softintr, "ddi_periodic",
2256 (caddr_t)(uintptr_t)i, NULL);
2257 }
2258
2259 #if !defined(__xpv)
2260 if (modload("drv", "amd_iommu") < 0) {
2261 PRM_POINT("No AMD IOMMU present\n");
2262 } else if (ddi_hold_installed_driver(ddi_name_to_major(
2263 "amd_iommu")) == NULL) {
2264 prom_printf("ERROR: failed to attach AMD IOMMU\n");
2265 }
2266 #endif
2267 post_startup_cpu_fixups();
2268
2269 PRM_POINT("startup_end() done");
2270 }
2271
2272 /*
2273 * Don't remove the following 2 variables. They are necessary
2274 * for reading the hostid from the legacy file (/kernel/misc/sysinit).
2275 */
2276 char *_hs1107 = hw_serial;
2277 ulong_t _bdhs34;
2278
2279 void
2280 post_startup(void)
2281 {
2282 extern void cpupm_init(cpu_t *);
2283 extern void cpu_event_init_cpu(cpu_t *);
2284
2285 /*
2286 * Set the system wide, processor-specific flags to be passed
2287 * to userland via the aux vector for performance hints and
2288 * instruction set extensions.
2289 */
2290 bind_hwcap();
2291
2292 #ifdef __xpv
2293 if (DOMAIN_IS_INITDOMAIN(xen_info))
2294 #endif
2295 {
2296 #if defined(__xpv)
2297 xpv_panic_init();
2298 #else
2299 /*
2300 * Startup the memory scrubber.
2301 * XXPV This should be running somewhere ..
2302 */
2303 if ((get_hwenv() & HW_VIRTUAL) == 0)
2304 memscrub_init();
2305 #endif
2306 }
2307
2308 /*
2309 * Complete CPU module initialization
2310 */
2311 cmi_post_startup();
2312
2313 /*
2314 * Perform forceloading tasks for /etc/system.
2315 */
2316 (void) mod_sysctl(SYS_FORCELOAD, NULL);
2317
2318 /*
2319 * ON4.0: Force /proc module in until clock interrupt handle fixed
2320 * ON4.0: This must be fixed or restated in /etc/systems.
2321 */
2322 (void) modload("fs", "procfs");
2323
2324 (void) i_ddi_attach_hw_nodes("pit_beep");
2325
2326 #if defined(__i386)
2327 /*
2328 * Check for required functional Floating Point hardware,
2329 * unless FP hardware explicitly disabled.
2330 */
2331 if (fpu_exists && (fpu_pentium_fdivbug || fp_kind == FP_NO))
2332 halt("No working FP hardware found");
2333 #endif
2334
2335 maxmem = freemem;
2336
2337 cpu_event_init_cpu(CPU);
2338 cpupm_init(CPU);
2339 (void) mach_cpu_create_device_node(CPU, NULL);
2340
2341 pg_init();
2342 }
2343
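/*
 * Return non-zero if the page falls within the physical byte-address range
 * [low_addr, high_addr); btop() truncates the lower bound to a page frame
 * and btopr() rounds the (possibly unaligned) upper bound up.
 */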
2344 static int
2345 pp_in_range(page_t *pp, uint64_t low_addr, uint64_t high_addr)
2346 {
2347 return ((pp->p_pagenum >= btop(low_addr)) &&
2348 (pp->p_pagenum < btopr(high_addr)));
2349 }
2350
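/*
 * Return non-zero if the page lies inside any of the saved boot-module
 * ranges; the modranges array is terminated by an entry with phys == 0.
 */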
2351 static int
2352 pp_in_module(page_t *pp, const rd_existing_t *modranges)
2353 {
2354 uint_t i;
2355
2356 for (i = 0; modranges[i].phys != 0; i++) {
2357 if (pp_in_range(pp, modranges[i].phys,
2358 modranges[i].phys + modranges[i].size))
2359 return (1);
2360 }
2361
2362 return (0);
2363 }
2364
2365 void
2366 release_bootstrap(void)
2367 {
2368 int root_is_ramdisk;
2369 page_t *pp;
2370 extern void kobj_boot_unmountroot(void);
2371 extern dev_t rootdev;
2372 uint_t i;
2373 char propname[32];
2374 rd_existing_t *modranges;
2375 #if !defined(__xpv)
2376 pfn_t pfn;
2377 #endif
2378
2379 /*
2380 * Save the bootfs module ranges so that we can reserve them below
2381 * for the real bootfs.
2382 */
2383 modranges = kmem_alloc(sizeof (rd_existing_t) * MAX_BOOT_MODULES,
2384 KM_SLEEP);
2385 for (i = 0; ; i++) {
2386 uint64_t start, size;
2387
2388 modranges[i].phys = 0;
2389
2390 (void) snprintf(propname, sizeof (propname),
2391 "module-addr-%u", i);
2392 if (do_bsys_getproplen(NULL, propname) <= 0)
2393 break;
2394 (void) do_bsys_getprop(NULL, propname, &start);
2395
2396 (void) snprintf(propname, sizeof (propname),
2397 "module-size-%u", i);
2398 if (do_bsys_getproplen(NULL, propname) <= 0)
2399 break;
2400 (void) do_bsys_getprop(NULL, propname, &size);
2401
2402 modranges[i].phys = start;
2403 modranges[i].size = size;
2404 }
2405
2406 /* unmount boot ramdisk and release kmem usage */
2407 kobj_boot_unmountroot();
2408
2409 /*
2410 * We're finished using the boot loader so free its pages.
2411 */
2412 PRM_POINT("Unmapping lower boot pages");
2413
2414 clear_boot_mappings(0, _userlimit);
2415
2416 postbootkernelbase = kernelbase;
2417
2418 /*
2419 * If root isn't on ramdisk, destroy the hardcoded
2420 * ramdisk node now and release the memory. Else,
2421 * ramdisk memory is kept in rd_pages.
2422 */
2423 root_is_ramdisk = (getmajor(rootdev) == ddi_name_to_major("ramdisk"));
2424 if (!root_is_ramdisk) {
2425 dev_info_t *dip = ddi_find_devinfo("ramdisk", -1, 0);
2426 ASSERT(dip && ddi_get_parent(dip) == ddi_root_node());
2427 ndi_rele_devi(dip); /* held from ddi_find_devinfo */
2428 (void) ddi_remove_child(dip, 0);
2429 }
2430
2431 PRM_POINT("Releasing boot pages");
2432 while (bootpages) {
2433 extern uint64_t ramdisk_start, ramdisk_end;
2434 pp = bootpages;
2435 bootpages = pp->p_next;
2436
2437
2438 /* Keep pages for the lower 64K */
2439 if (pp_in_range(pp, 0, 0x40000)) {
2440 pp->p_next = lower_pages;
2441 lower_pages = pp;
2442 lower_pages_count++;
2443 continue;
2444 }
2445
2446 if ((root_is_ramdisk && pp_in_range(pp, ramdisk_start,
2447 ramdisk_end)) || pp_in_module(pp, modranges)) {
2448 pp->p_next = rd_pages;
2449 rd_pages = pp;
2450 continue;
2451 }
2452 pp->p_next = (struct page *)0;
2453 pp->p_prev = (struct page *)0;
2454 PP_CLRBOOTPAGES(pp);
2455 page_free(pp, 1);
2456 }
2457 PRM_POINT("Boot pages released");
2458
2459 kmem_free(modranges, sizeof (rd_existing_t) * MAX_BOOT_MODULES);
2460
2461 #if !defined(__xpv)
2462 /* XXPV -- note this following bunch of code needs to be revisited in Xen 3.0 */
2463 /*
2464 * Find 1 page below 1 MB so that other processors can boot up or
2465 * so that any processor can resume.
2466 * Make sure it has a kernel VA as well as a 1:1 mapping.
2467 * We should have just free'd one up.
2468 */
2469
2470 /*
2471 * 0x10 pages is 64K. Leave the bottom 64K alone
2472 * for BIOS.
2473 */
2474 for (pfn = 0x10; pfn < btop(1*1024*1024); pfn++) {
2475 if (page_numtopp_alloc(pfn) == NULL)
2476 continue;
2477 rm_platter_va = i86devmap(pfn, 1,
2478 PROT_READ | PROT_WRITE | PROT_EXEC);
2479 rm_platter_pa = ptob(pfn);
2480 break;
2481 }
2482 if (pfn == btop(1*1024*1024) && use_mp)
2483 panic("No page below 1M available for starting "
2484 "other processors or for resuming from system-suspend");
2485 #endif /* !__xpv */
2486 }
2487
2488 /*
2489 * Initialize the platform-specific parts of a page_t.
2490 */
2491 void
2492 add_physmem_cb(page_t *pp, pfn_t pnum)
2493 {
2494 pp->p_pagenum = pnum;
2495 pp->p_mapping = NULL;
2496 pp->p_embed = 0;
2497 pp->p_share = 0;
2498 pp->p_mlentry = 0;
2499 }
2500
2501 /*
2502 * kphysm_init() initializes physical memory.
2503 */
2504 static pgcnt_t
2505 kphysm_init(
2506 page_t *pp,
2507 pgcnt_t npages)
2508 {
2509 struct memlist *pmem;
2510 struct memseg *cur_memseg;
2511 pfn_t base_pfn;
2512 pfn_t end_pfn;
2513 pgcnt_t num;
2514 pgcnt_t pages_done = 0;
2515 uint64_t addr;
2516 uint64_t size;
2517 extern pfn_t ddiphysmin;
2518 extern int mnode_xwa;
2519 int ms = 0, me = 0;
2520
2521 ASSERT(page_hash != NULL && page_hashsz != 0);
2522
2523 cur_memseg = memseg_base;
2524 for (pmem = phys_avail; pmem && npages; pmem = pmem->ml_next) {
2525 /*
2526 * In a 32 bit kernel we can't use higher memory if we're
2527 * not booting in PAE mode. This check takes care of that.
2528 */
2529 addr = pmem->ml_address;
2530 size = pmem->ml_size;
2531 if (btop(addr) > physmax)
2532 continue;
2533
2534 /*
2535 * align addr and size - they may not be at page boundaries
2536 */
2537 if ((addr & MMU_PAGEOFFSET) != 0) {
2538 addr += MMU_PAGEOFFSET;
2539 addr &= ~(uint64_t)MMU_PAGEOFFSET;
2540 size -= addr - pmem->ml_address;
2541 }
2542
2543 /* only process pages below or equal to physmax */
2544 if ((btop(addr + size) - 1) > physmax)
2545 size = ptob(physmax - btop(addr) + 1);
2546
2547 num = btop(size);
2548 if (num == 0)
2549 continue;
2550
2551 if (num > npages)
2552 num = npages;
2553
2554 npages -= num;
2555 pages_done += num;
2556 base_pfn = btop(addr);
2557
2558 if (prom_debug)
2559 prom_printf("MEMSEG addr=0x%" PRIx64
2560 " pgs=0x%lx pfn 0x%lx-0x%lx\n",
2561 addr, num, base_pfn, base_pfn + num);
2562
2563 /*
2564 * Ignore pages below ddiphysmin to simplify ddi memory
2565 * allocation with non-zero addr_lo requests.
2566 */
2567 if (base_pfn < ddiphysmin) {
2568 if (base_pfn + num <= ddiphysmin)
2569 continue;
2570 pp += (ddiphysmin - base_pfn);
2571 num -= (ddiphysmin - base_pfn);
2572 base_pfn = ddiphysmin;
2573 }
2574
2575 /*
2576 * mnode_xwa is greater than 1 when large pages regions can
2577 * cross memory node boundaries. To prevent the formation
2578 * of these large pages, configure the memsegs based on the
2579 * memory node ranges which had been made non-contiguous.
2580 */
2581 if (mnode_xwa > 1) {
2582
2583 end_pfn = base_pfn + num - 1;
2584 ms = PFN_2_MEM_NODE(base_pfn);
2585 me = PFN_2_MEM_NODE(end_pfn);
2586
2587 if (ms != me) {
2588 /*
2589 * current range spans more than 1 memory node.
2590 * Set num to only the pfn range in the start
2591 * memory node.
2592 */
2593 num = mem_node_config[ms].physmax - base_pfn
2594 + 1;
2595 ASSERT(end_pfn > mem_node_config[ms].physmax);
2596 }
2597 }
2598
2599 for (;;) {
2600 /*
2601 * Build the memsegs entry
2602 */
2603 cur_memseg->pages = pp;
2604 cur_memseg->epages = pp + num;
2605 cur_memseg->pages_base = base_pfn;
2606 cur_memseg->pages_end = base_pfn + num;
2607
2608 /*
2609 * Insert into memseg list in decreasing pfn range
2610 * order. Low memory is typically more fragmented such
2611 * that this ordering keeps the larger ranges at the
2612 * front of the list for code that searches memseg.
2613 * This ASSERTS that the memsegs coming in from boot
2614 * are in increasing physical address order and not
2615 * contiguous.
2616 */
2617 if (memsegs != NULL) {
2618 ASSERT(cur_memseg->pages_base >=
2619 memsegs->pages_end);
2620 cur_memseg->next = memsegs;
2621 }
2622 memsegs = cur_memseg;
2623
2624 /*
2625 * add_physmem() initializes the PSM part of the page
2626 * struct by calling the PSM back with add_physmem_cb().
2627 * In addition it coalesces pages into larger pages as
2628 * it initializes them.
2629 */
2630 add_physmem(pp, num, base_pfn);
2631 cur_memseg++;
2632 availrmem_initial += num;
2633 availrmem += num;
2634
2635 pp += num;
2636 if (ms >= me)
2637 break;
2638
2639 /* process next memory node range */
2640 ms++;
2641 base_pfn = mem_node_config[ms].physbase;
2642 num = MIN(mem_node_config[ms].physmax,
2643 end_pfn) - base_pfn + 1;
2644 }
2645 }
2646
2647 PRM_DEBUG(availrmem_initial);
2648 PRM_DEBUG(availrmem);
2649 PRM_DEBUG(freemem);
2650 build_pfn_hash();
2651 return (pages_done);
2652 }
2653
2654 /*
2655 * Kernel VM initialization.
2656 */
2657 static void
2658 kvm_init(void)
2659 {
2660 ASSERT((((uintptr_t)s_text) & MMU_PAGEOFFSET) == 0);
2661
2662 /*
2663 * Put the kernel segments in kernel address space.
2664 */
2665 rw_enter(&kas.a_lock, RW_WRITER);
2666 as_avlinit(&kas);
2667
2668 (void) seg_attach(&kas, s_text, e_moddata - s_text, &ktextseg);
2669 (void) segkmem_create(&ktextseg);
2670
2671 (void) seg_attach(&kas, (caddr_t)valloc_base, valloc_sz, &kvalloc);
2672 (void) segkmem_create(&kvalloc);
2673
2674 (void) seg_attach(&kas, kernelheap,
2675 ekernelheap - kernelheap, &kvseg);
2676 (void) segkmem_create(&kvseg);
2677
2678 if (core_size > 0) {
2679 PRM_POINT("attaching kvseg_core");
2680 (void) seg_attach(&kas, (caddr_t)core_base, core_size,
2681 &kvseg_core);
2682 (void) segkmem_create(&kvseg_core);
2683 }
2684
2685 if (segziosize > 0) {
2686 PRM_POINT("attaching segzio");
2687 (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
2688 &kzioseg);
2689 (void) segkmem_zio_create(&kzioseg);
2690
2691 /* create zio area covering new segment */
2692 segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
2693 }
2694
2695 (void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
2696 (void) segkmem_create(&kdebugseg);
2697
2698 rw_exit(&kas.a_lock);
2699
2700 /*
2701 * Ensure that the red zone at kernelbase is never accessible.
2702 */
2703 PRM_POINT("protecting redzone");
2704 (void) as_setprot(&kas, (caddr_t)kernelbase, KERNEL_REDZONE_SIZE, 0);
2705
2706 /*
2707 * Make the text writable so that it can be hot patched by DTrace.
2708 */
2709 (void) as_setprot(&kas, s_text, e_modtext - s_text,
2710 PROT_READ | PROT_WRITE | PROT_EXEC);
2711
2712 /*
2713 * Make data writable until end.
2714 */
2715 (void) as_setprot(&kas, s_data, e_moddata - s_data,
2716 PROT_READ | PROT_WRITE | PROT_EXEC);
2717 }
2718
2719 #ifndef __xpv
2720 /*
2721 * Solaris adds an entry for Write Combining caching to the PAT
2722 */
2723 static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
2724
2725 void
2726 pat_sync(void)
2727 {
2728 ulong_t cr0, cr0_orig, cr4;
2729
2730 if (!is_x86_feature(x86_featureset, X86FSET_PAT))
2731 return;
2732 cr0_orig = cr0 = getcr0();
2733 cr4 = getcr4();
2734
2735 /* disable caching and flush all caches and TLBs */
2736 cr0 |= CR0_CD;
2737 cr0 &= ~CR0_NW;
2738 setcr0(cr0);
2739 invalidate_cache();
2740 if (cr4 & CR4_PGE) {
2741 setcr4(cr4 & ~(ulong_t)CR4_PGE);
2742 setcr4(cr4);
2743 } else {
2744 reload_cr3();
2745 }
2746
2747 /* add our entry to the PAT */
2748 wrmsr(REG_PAT, pat_attr_reg);
2749
2750 /* flush TLBs and cache again, then reenable cr0 caching */
2751 if (cr4 & CR4_PGE) {
2752 setcr4(cr4 & ~(ulong_t)CR4_PGE);
2753 setcr4(cr4);
2754 } else {
2755 reload_cr3();
2756 }
2757 invalidate_cache();
2758 setcr0(cr0_orig);
2759 }
2760
2761 #endif /* !__xpv */
2762
2763 #if defined(_SOFT_HOSTID)
2764 /*
2765 * On platforms that do not have a hardware serial number, attempt
2766 * to set one based on the contents of /etc/hostid. If this file does
2767 * not exist, assume that we are to generate a new hostid and set
2768 * it in the kernel, for subsequent saving by a userland process
2769 * once the system is up and the root filesystem is mounted r/w.
2770 *
2771 * In order to gracefully support upgrade on OpenSolaris, if
2772 * /etc/hostid does not exist, we will attempt to get a serial number
2773 * using the legacy method (/kernel/misc/sysinit).
2774 *
2775 * If that isn't present, we attempt to use an SMBIOS UUID, which is
2776 * a hardware serial number. Note that we don't automatically trust
2777 * all SMBIOS UUIDs (some older platforms are defective and ship duplicate
2778 * UUIDs in violation of the standard); instead, we check against a blacklist.
2779 *
2780 * In an attempt to make the hostid less prone to abuse
2781 * (for license circumvention, etc), we store it in /etc/hostid
2782 * in rot47 format.
2783 */
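/*
 * Minimal sketch (illustrative only, not used by this file) of the rot47
 * transform described above.  rot47 maps the printable ASCII range '!'..'~'
 * onto itself and is its own inverse, so the same routine both encodes a
 * hostid for /etc/hostid and decodes it again; the in-place loop in
 * set_soft_hostid() below is the decoding direction.
 */
static void
rot47_example(unsigned char *s)
{
	for (; *s != '\0'; s++) {
		/* rotate printable characters by 47 within the 94-char range */
		if (*s >= '!' && *s <= '~')
			*s = '!' + (((*s - '!') + 47) % 94);
	}
}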
2784 extern volatile unsigned long tenmicrodata;
2785 static int atoi(char *);
2786
2787 /*
2788 * Set this to non-zero in /etc/system if you think your SMBIOS returns a
2789 * UUID that is not unique. (Also report it so that the smbios_uuid_blacklist
2790 * array can be updated.)
2791 */
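/*
 * An illustrative /etc/system entry for the knob declared below:
 *
 *	set smbios_broken_uuid = 1
 */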
2792 int smbios_broken_uuid = 0;
2793
2794 /*
2795 * List of known bad UUIDs. Each entry is a full 16-byte UUID, and the
2796 * check below compares all 16 bytes. If your hostid falls here, you need
2797 * to contact your hardware OEM for a fix for your BIOS.
2798 */
2799 static unsigned char
2800 smbios_uuid_blacklist[][16] = {
2801
2802 { /* Reported bad UUID (Google search) */
2803 0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05,
2804 0x00, 0x06, 0x00, 0x07, 0x00, 0x08, 0x00, 0x09,
2805 },
2806 { /* Known bad DELL UUID */
2807 0x4C, 0x4C, 0x45, 0x44, 0x00, 0x00, 0x20, 0x10,
2808 0x80, 0x20, 0x80, 0xC0, 0x4F, 0x20, 0x20, 0x20,
2809 },
2810 { /* Uninitialized flash */
2811 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2812 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
2813 },
2814 { /* All zeros */
2815 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2816 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
2817 },
2818 };
2819
2820 static int32_t
2821 uuid_to_hostid(const uint8_t *uuid)
2822 {
2823 /*
2824 * Although the UUIDs are 128-bits, they may not distribute entropy
2825 * evenly. We would like to use SHA or MD5, but those are located
2826 * in loadable modules and not available this early in boot. As we
2827 * don't need the values to be cryptographically strong, we just
2828 * generate a 32-bit value by xor'ing the various sequences together,
2829 * which ensures that the entire UUID contributes to the hostid.
2830 */
2831 uint32_t id = 0;
2832
2833 /* first check against the blacklist */
2834 for (int i = 0; i < (sizeof (smbios_uuid_blacklist) / 16); i++) {
2835 if (bcmp(smbios_uuid_blacklist[i], uuid, 16) == 0) {
2836 cmn_err(CE_CONT, "?Broken SMBIOS UUID. "
2837 "Contact BIOS manufacturer for repair.\n");
2838 return ((int32_t)HW_INVALID_HOSTID);
2839 }
2840 }
2841
2842 for (int i = 0; i < 16; i++)
2843 id ^= ((uuid[i]) << (8 * (i % sizeof (id))));
2844
2845 /* Make sure return value is positive */
2846 return (id & 0x7fffffff);
2847 }
2848
2849 static int32_t
2850 set_soft_hostid(void)
2851 {
2852 struct _buf *file;
2853 char tokbuf[MAXNAMELEN];
2854 token_t token;
2855 int done = 0;
2856 u_longlong_t tmp;
2857 int i;
2858 int32_t hostid = (int32_t)HW_INVALID_HOSTID;
2859 unsigned char *c;
2860 hrtime_t tsc;
2861 smbios_system_t smsys;
2862
2863 /*
2864 * If /etc/hostid file not found, we'd like to get a pseudo
2865 * random number to use as the hostid. A nice way to do this
2866 * is to read the real time clock. To remain xen-compatible,
2867 * we can't poke the real hardware, so we use tsc_read() to
2868 * read the real time clock. However, there is an ominous
2869 * warning in tsc_read that says it can return zero, so we
2870 * deal with that possibility by falling back to using the
2871 * (hopefully random enough) value in tenmicrodata.
2872 */
2873
2874 if ((file = kobj_open_file(hostid_file)) == (struct _buf *)-1) {
2875 /*
2876 * hostid file not found - try to load sysinit module
2877 * and see if it has a nonzero hostid value...use that
2878 * instead of generating a new hostid here if so.
2879 */
2880 if ((i = modload("misc", "sysinit")) != -1) {
2881 if (strlen(hw_serial) > 0)
2882 hostid = (int32_t)atoi(hw_serial);
2883 (void) modunload(i);
2884 }
2885
2886 /*
2887 * We try to use the SMBIOS UUID. But not if it is blacklisted
2888 * in /etc/system.
2889 */
2890 if ((hostid == HW_INVALID_HOSTID) &&
2891 (smbios_broken_uuid == 0) &&
2892 (ksmbios != NULL) &&
2893 (smbios_info_system(ksmbios, &smsys) != SMB_ERR) &&
2894 (smsys.smbs_uuidlen >= 16)) {
2895 hostid = uuid_to_hostid(smsys.smbs_uuid);
2896 }
2897
2898 /*
2899 * Generate a "random" hostid using the clock. These
2900 * hostids will change on each boot if the value is not
2901 * saved to a persistent /etc/hostid file.
2902 */
2903 if (hostid == HW_INVALID_HOSTID) {
2904 tsc = tsc_read();
2905 if (tsc == 0) /* tsc_read can return zero sometimes */
2906 hostid = (int32_t)tenmicrodata & 0x0CFFFFF;
2907 else
2908 hostid = (int32_t)tsc & 0x0CFFFFF;
2909 }
2910 } else {
2911 /* hostid file found */
2912 while (!done) {
2913 token = kobj_lex(file, tokbuf, sizeof (tokbuf));
2914
2915 switch (token) {
2916 case POUND:
2917 /*
2918 * skip comments
2919 */
2920 kobj_find_eol(file);
2921 break;
2922 case STRING:
2923 /*
2924 * un-rot47 - obviously this
2925 * nonsense is ascii-specific
2926 */
2927 for (c = (unsigned char *)tokbuf;
2928 *c != '\0'; c++) {
2929 *c += 47;
2930 if (*c > '~')
2931 *c -= 94;
2932 else if (*c < '!')
2933 *c += 94;
2934 }
2935 /*
2936 * now we should have a real number
2937 */
2938
2939 if (kobj_getvalue(tokbuf, &tmp) != 0)
2940 kobj_file_err(CE_WARN, file,
2941 "Bad value %s for hostid",
2942 tokbuf);
2943 else
2944 hostid = (int32_t)tmp;
2945
2946 break;
2947 case EOF:
2948 done = 1;
2949 /* FALLTHROUGH */
2950 case NEWLINE:
2951 kobj_newline(file);
2952 break;
2953 default:
2954 break;
2955
2956 }
2957 }
2958 if (hostid == HW_INVALID_HOSTID) /* didn't find a hostid */
2959 kobj_file_err(CE_WARN, file,
2960 "hostid missing or corrupt");
2961
2962 kobj_close_file(file);
2963 }
2964 /*
2965 * hostid is now the value read from /etc/hostid, or the
2966 * new hostid we generated in this routine or HW_INVALID_HOSTID if not
2967 * set.
2968 */
2969 return (hostid);
2970 }
2971
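/*
 * Minimal decimal conversion used only for the legacy hw_serial string
 * above; it assumes the input contains nothing but ASCII digits and does
 * no overflow checking.
 */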
2972 static int
2973 atoi(char *p)
2974 {
2975 int i = 0;
2976
2977 while (*p != '\0')
2978 i = 10 * i + (*p++ - '0');
2979
2980 return (i);
2981 }
2982
2983 #endif /* _SOFT_HOSTID */
2984
2985 void
2986 get_system_configuration(void)
2987 {
2988 char prop[32];
2989 u_longlong_t nodes_ll, cpus_pernode_ll, lvalue;
2990
2991 if (BOP_GETPROPLEN(bootops, "nodes") > sizeof (prop) ||
2992 BOP_GETPROP(bootops, "nodes", prop) < 0 ||
2993 kobj_getvalue(prop, &nodes_ll) == -1 ||
2994 nodes_ll > MAXNODES ||
2995 BOP_GETPROPLEN(bootops, "cpus_pernode") > sizeof (prop) ||
2996 BOP_GETPROP(bootops, "cpus_pernode", prop) < 0 ||
2997 kobj_getvalue(prop, &cpus_pernode_ll) == -1) {
2998 system_hardware.hd_nodes = 1;
2999 system_hardware.hd_cpus_per_node = 0;
3000 } else {
3001 system_hardware.hd_nodes = (int)nodes_ll;
3002 system_hardware.hd_cpus_per_node = (int)cpus_pernode_ll;
3003 }
3004
3005 if (BOP_GETPROPLEN(bootops, "kernelbase") > sizeof (prop) ||
3006 BOP_GETPROP(bootops, "kernelbase", prop) < 0 ||
3007 kobj_getvalue(prop, &lvalue) == -1)
3008 eprom_kernelbase = NULL;
3009 else
3010 eprom_kernelbase = (uintptr_t)lvalue;
3011
3012 if (BOP_GETPROPLEN(bootops, "segmapsize") > sizeof (prop) ||
3013 BOP_GETPROP(bootops, "segmapsize", prop) < 0 ||
3014 kobj_getvalue(prop, &lvalue) == -1)
3015 segmapsize = SEGMAPDEFAULT;
3016 else
3017 segmapsize = (uintptr_t)lvalue;
3018
3019 if (BOP_GETPROPLEN(bootops, "segmapfreelists") > sizeof (prop) ||
3020 BOP_GETPROP(bootops, "segmapfreelists", prop) < 0 ||
3021 kobj_getvalue(prop, &lvalue) == -1)
3022 segmapfreelists = 0; /* use segmap driver default */
3023 else
3024 segmapfreelists = (int)lvalue;
3025
3026 /* physmem used to be here, but moved much earlier to fakebop.c */
3027 }
3028
3029 /*
3030 * Add to a memory list.
3031 * start = start of new memory segment
3032 * len = length of new memory segment in bytes
3033 * new = pointer to a new struct memlist
3034 * memlistp = memory list to which to add segment.
3035 */
3036 void
3037 memlist_add(
3038 uint64_t start,
3039 uint64_t len,
3040 struct memlist *new,
3041 struct memlist **memlistp)
3042 {
3043 struct memlist *cur;
3044 uint64_t end = start + len;
3045
3046 new->ml_address = start;
3047 new->ml_size = len;
3048
3049 cur = *memlistp;
3050
3051 while (cur) {
3052 if (cur->ml_address >= end) {
3053 new->ml_next = cur;
3054 *memlistp = new;
3055 new->ml_prev = cur->ml_prev;
3056 cur->ml_prev = new;
3057 return;
3058 }
3059 ASSERT(cur->ml_address + cur->ml_size <= start);
3060 if (cur->ml_next == NULL) {
3061 cur->ml_next = new;
3062 new->ml_prev = cur;
3063 new->ml_next = NULL;
3064 return;
3065 }
3066 memlistp = &cur->ml_next;
3067 cur = cur->ml_next;
3068 }
3069 }
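/*
 * Illustrative (hypothetical) use of memlist_add(): insert a 64K segment at
 * 1MB into a sorted list that already holds a segment at 16MB.  The names
 * below are examples only; note that memlist_add() assumes the list is
 * non-empty and that the new segment does not overlap an existing one.
 */
static struct memlist example_head = {
	.ml_address = 0x1000000ULL,	/* existing segment at 16MB */
	.ml_size = 0x100000ULL,
};
static struct memlist *example_list = &example_head;
static struct memlist example_new;

static void
example_memlist_add(void)
{
	memlist_add(0x100000ULL, 0x10000ULL, &example_new, &example_list);
	/* example_list now points at example_new, whose ml_next is &example_head */
}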
3070
3071 void
3072 kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
3073 {
3074 size_t tsize = e_modtext - modtext;
3075 size_t dsize = e_moddata - moddata;
3076
3077 *text_arena = vmem_create("module_text", tsize ? modtext : NULL, tsize,
3078 1, segkmem_alloc, segkmem_free, heaptext_arena, 0, VM_SLEEP);
3079 *data_arena = vmem_create("module_data", dsize ? moddata : NULL, dsize,
3080 1, segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
3081 }
3082
3083 caddr_t
3084 kobj_text_alloc(vmem_t *arena, size_t size)
3085 {
3086 return (vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT));
3087 }
3088
3089 /*ARGSUSED*/
3090 caddr_t
3091 kobj_texthole_alloc(caddr_t addr, size_t size)
3092 {
3093 panic("unexpected call to kobj_texthole_alloc()");
3094 /*NOTREACHED*/
3095 return (0);
3096 }
3097
3098 /*ARGSUSED*/
3099 void
3100 kobj_texthole_free(caddr_t addr, size_t size)
3101 {
3102 panic("unexpected call to kobj_texthole_free()");
3103 }
3104
3105 /*
3106 * This is called just after configure() in startup().
3107 *
3108 * The ISALIST concept is a bit hopeless on Intel, because
3109 * there's no guarantee of an ever-more-capable processor
3110 * given that various parts of the instruction set may appear
3111 * and disappear between different implementations.
3112 *
3113 * While it would be possible to correct it and even enhance
3114 * it somewhat, the explicit hardware capability bitmask allows
3115 * more flexibility.
3116 *
3117 * So, we just leave this alone.
3118 */
3119 void
3120 setx86isalist(void)
3121 {
3122 char *tp;
3123 size_t len;
3124 extern char *isa_list;
3125
3126 #define TBUFSIZE 1024
3127
3128 tp = kmem_alloc(TBUFSIZE, KM_SLEEP);
3129 *tp = '\0';
3130
3131 #if defined(__amd64)
3132 (void) strcpy(tp, "amd64 ");
3133 #endif
3134
3135 switch (x86_vendor) {
3136 case X86_VENDOR_Intel:
3137 case X86_VENDOR_AMD:
3138 case X86_VENDOR_TM:
3139 if (is_x86_feature(x86_featureset, X86FSET_CMOV)) {
3140 /*
3141 * Pentium Pro or later
3142 */
3143 (void) strcat(tp, "pentium_pro");
3144 (void) strcat(tp,
3145 is_x86_feature(x86_featureset, X86FSET_MMX) ?
3146 "+mmx pentium_pro " : " ");
3147 }
3148 /*FALLTHROUGH*/
3149 case X86_VENDOR_Cyrix:
3150 /*
3151 * The Cyrix 6x86 does not have any Pentium features
3152 * accessible while not at privilege level 0.
3153 */
3154 if (is_x86_feature(x86_featureset, X86FSET_CPUID)) {
3155 (void) strcat(tp, "pentium");
3156 (void) strcat(tp,
3157 is_x86_feature(x86_featureset, X86FSET_MMX) ?
3158 "+mmx pentium " : " ");
3159 }
3160 break;
3161 default:
3162 break;
3163 }
3164 (void) strcat(tp, "i486 i386 i86");
3165 len = strlen(tp) + 1; /* account for NULL at end of string */
3166 isa_list = strcpy(kmem_alloc(len, KM_SLEEP), tp);
3167 kmem_free(tp, TBUFSIZE);
3168
3169 #undef TBUFSIZE
3170 }
3171
3172
3173 #ifdef __amd64
3174
3175 void *
3176 device_arena_alloc(size_t size, int vm_flag)
3177 {
3178 return (vmem_alloc(device_arena, size, vm_flag));
3179 }
3180
3181 void
3182 device_arena_free(void *vaddr, size_t size)
3183 {
3184 vmem_free(device_arena, vaddr, size);
3185 }
3186
3187 #else /* __i386 */
3188
3189 void *
3190 device_arena_alloc(size_t size, int vm_flag)
3191 {
3192 caddr_t vaddr;
3193 uintptr_t v;
3194 size_t start;
3195 size_t end;
3196
3197 vaddr = vmem_alloc(heap_arena, size, vm_flag);
3198 if (vaddr == NULL)
3199 return (NULL);
3200
3201 v = (uintptr_t)vaddr;
3202 ASSERT(v >= kernelbase);
3203 ASSERT(v + size <= valloc_base);
3204
3205 start = btop(v - kernelbase);
3206 end = btop(v + size - 1 - kernelbase);
3207 ASSERT(start < toxic_bit_map_len);
3208 ASSERT(end < toxic_bit_map_len);
3209
3210 while (start <= end) {
3211 BT_ATOMIC_SET(toxic_bit_map, start);
3212 ++start;
3213 }
3214 return (vaddr);
3215 }
3216
3217 void
3218 device_arena_free(void *vaddr, size_t size)
3219 {
3220 uintptr_t v = (uintptr_t)vaddr;
3221 size_t start;
3222 size_t end;
3223
3224 ASSERT(v >= kernelbase);
3225 ASSERT(v + size <= valloc_base);
3226
3227 start = btop(v - kernelbase);
3228 end = btop(v + size - 1 - kernelbase);
3229 ASSERT(start < toxic_bit_map_len);
3230 ASSERT(end < toxic_bit_map_len);
3231
3232 while (start <= end) {
3233 ASSERT(BT_TEST(toxic_bit_map, start) != 0);
3234 BT_ATOMIC_CLEAR(toxic_bit_map, start);
3235 ++start;
3236 }
3237 vmem_free(heap_arena, vaddr, size);
3238 }
3239
3240 /*
3241 * returns 1st address in range that is in device arena, or NULL
3242 * if len is not NULL it returns the length of the toxic range
3243 */
3244 void *
3245 device_arena_contains(void *vaddr, size_t size, size_t *len)
3246 {
3247 uintptr_t v = (uintptr_t)vaddr;
3248 uintptr_t eaddr = v + size;
3249 size_t start;
3250 size_t end;
3251
3252 /*
3253 * if called very early by kmdb, just return NULL
3254 */
3255 if (toxic_bit_map == NULL)
3256 return (NULL);
3257
3258 /*
3259 * First check if we're completely outside the bitmap range.
3260 */
3261 if (v >= valloc_base || eaddr < kernelbase)
3262 return (NULL);
3263
3264 /*
3265 * Trim ends of search to look at only what the bitmap covers.
3266 */
3267 if (v < kernelbase)
3268 v = kernelbase;
3269 start = btop(v - kernelbase);
3270 end = btop(eaddr - kernelbase);
3271 if (end >= toxic_bit_map_len)
3272 end = toxic_bit_map_len;
3273
3274 if (bt_range(toxic_bit_map, &start, &end, end) == 0)
3275 return (NULL);
3276
3277 v = kernelbase + ptob(start);
3278 if (len != NULL)
3279 *len = ptob(end - start);
3280 return ((void *)v);
3281 }
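/*
 * Illustrative (hypothetical) caller of device_arena_contains(): decide
 * whether any part of a kernel virtual range is a device (toxic) mapping,
 * for example before choosing how to copy from it.  The function name and
 * toxic_len parameter are examples only.
 */
static int
example_range_is_toxic(caddr_t addr, size_t size, size_t *toxic_len)
{
	/* non-NULL means at least part of [addr, addr + size) is toxic */
	return (device_arena_contains((void *)addr, size, toxic_len) != NULL);
}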
3282
3283 #endif /* __i386 */
3284