1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1988 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * Copyright 2019 Joyent, Inc.
31 */
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/pcb.h>
37 #include <sys/systm.h>
38 #include <sys/signal.h>
39 #include <sys/cred.h>
40 #include <sys/user.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/proc.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/priocntl.h>
47 #include <sys/procset.h>
48 #include <sys/disp.h>
49 #include <sys/callo.h>
50 #include <sys/callb.h>
51 #include <sys/debug.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/utsname.h>
55 #include <sys/cmn_err.h>
56 #include <sys/vmparam.h>
57 #include <sys/modctl.h>
58 #include <sys/vm.h>
59 #include <sys/callb.h>
60 #include <sys/ddi_periodic.h>
61 #include <sys/kmem.h>
62 #include <sys/vmem.h>
63 #include <sys/cpuvar.h>
64 #include <sys/cladm.h>
65 #include <sys/corectl.h>
66 #include <sys/exec.h>
67 #include <sys/syscall.h>
68 #include <sys/reboot.h>
69 #include <sys/task.h>
70 #include <sys/exacct.h>
71 #include <sys/autoconf.h>
72 #include <sys/errorq.h>
73 #include <sys/class.h>
74 #include <sys/stack.h>
75 #include <sys/brand.h>
76 #include <sys/mmapobj.h>
77 #include <sys/smt.h>
78
79 #include <vm/as.h>
80 #include <vm/seg_kmem.h>
81 #include <sys/dc_ki.h>
82
83 #include <c2/audit.h>
84 #include <sys/bootprops.h>
85
86 /* well known processes */
87 proc_t *proc_sched; /* memory scheduler */
88 proc_t *proc_init; /* init */
89 proc_t *proc_pageout; /* pageout daemon */
90 proc_t *proc_fsflush; /* fsflush daemon */
91
92 pgcnt_t maxmem; /* Maximum available memory in pages. */
93 pgcnt_t freemem; /* Current available memory in pages. */
94 int interrupts_unleashed; /* set when we do the first spl0() */
95
96 kmem_cache_t *process_cache; /* kmem cache for proc structures */
97
98 /*
99 * Indicates whether the auditing module (c2audit) is loaded. Possible
100 * values are:
101 * 0 - c2audit module is excluded in /etc/system and cannot be loaded
102 * 1 - c2audit module is not loaded but can be anytime
103 * 2 - c2audit module is loaded
104 */
105 int audit_active = C2AUDIT_DISABLED;
106
107 /*
108 * Process 0's lwp directory and lwpid hash table.
109 */
110 lwpdir_t p0_lwpdir[2];
111 tidhash_t p0_tidhash[2];
112 lwpent_t p0_lep;
113
/*
 * Machine-independent initialization code, implemented by main() below.
 * Called from the cold start routine as soon as a stack and
 * segmentation have been established.
 * Functions:
 *	clear and free user core
 *	turn on clock
 *	hand craft 0th process
 *	call all initialization routines
 *	fork	- process 0 to schedule
 *		- process 1 execute bootstrap
 *		- process 2 to page out
 *	create system threads
 */
129
/*
 * Cluster boot flags; main() tests the CLUSTER_BOOTED bit to decide
 * whether the cluster process has to be created.
 */
int cluster_bootflags = 0;

/*
 * Entry point of the cluster process created by main().  It simply hands
 * control to cluster(); should that call ever come back, the system
 * panics.
 */
void
cluster_wrapper(void)
{
	cluster();

	/* cluster() is not expected to return. */
	panic("cluster() returned");
}
138
139 char initname[INITNAME_SZ] = "/sbin/init"; /* also referenced by zone0 */
140 char initargs[BOOTARGS_MAX] = ""; /* also referenced by zone0 */
141
142 /*
143 * Construct a stack for init containing the arguments to it, then
144 * pass control to exec_common.
145 */
146 int
exec_init(const char * initpath,const char * args)147 exec_init(const char *initpath, const char *args)
148 {
149 uintptr_t ucp;
150 uintptr_t uap;
151 uintptr_t *argv;
152 uintptr_t exec_fnamep;
153 char *scratchargs;
154 int i, sarg;
155 size_t argvlen, alen;
156 size_t wlen = sizeof (uintptr_t);
157 boolean_t in_arg;
158 int argc = 0;
159 int error = 0, count = 0;
160 proc_t *p = ttoproc(curthread);
161 klwp_t *lwp = ttolwp(curthread);
162 int brand_action;
163
164 if (args == NULL)
165 args = "";
166
167 alen = strlen(initpath) + 1 + strlen(args) + 1;
168 scratchargs = kmem_alloc(alen, KM_SLEEP);
169 (void) snprintf(scratchargs, alen, "%s %s", initpath, args);
170
171 /*
172 * We do a quick two state parse of the string to sort out how big
173 * argc should be.
174 */
175 in_arg = B_FALSE;
176 for (i = 0; i < strlen(scratchargs); i++) {
177 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
178 if (in_arg) {
179 in_arg = B_FALSE;
180 argc++;
181 }
182 } else {
183 in_arg = B_TRUE;
184 }
185 }
186 argvlen = sizeof (uintptr_t) * (argc + 1);
187 argv = kmem_zalloc(argvlen, KM_SLEEP);
188
189 /*
190 * We pull off a bit of a hack here. We work our way through the
191 * args string, putting nulls at the ends of space delimited tokens
192 * (boot args don't support quoting at this time). Then we just
193 * copy the whole mess to userland in one go. In other words, we
194 * transform this: "init -s -r\0" into this on the stack:
195 *
196 * -0x00 \0
197 * -0x01 r
198 * -0x02 - <--------.
199 * -0x03 \0 |
200 * -0x04 s |
201 * -0x05 - <------. |
202 * -0x06 \0 | |
203 * -0x07 t | |
204 * -0x08 i | |
205 * -0x09 n | |
206 * -0x0a i <---. | |
207 * -0x10 NULL | | | (argv[3])
208 * -0x14 -----|--|-' (argv[2])
209 * -0x18 ------|--' (argv[1])
210 * -0x1c -------' (argv[0])
211 *
212 * Since we know the value of ucp at the beginning of this process,
213 * we can trivially compute the argv[] array which we also need to
214 * place in userland: argv[i] = ucp - sarg(i), where ucp is the
215 * stack ptr, and sarg is the string index of the start of the
216 * argument.
217 */
218 ucp = (uintptr_t)p->p_usrstack;
219
220 argc = 0;
221 in_arg = B_FALSE;
222 sarg = 0;
223
224 for (i = 0; i < alen; i++) {
225 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
226 if (in_arg == B_TRUE) {
227 in_arg = B_FALSE;
228 scratchargs[i] = '\0';
229 argv[argc++] = ucp - (alen - sarg);
230 }
231 } else if (in_arg == B_FALSE) {
232 in_arg = B_TRUE;
233 sarg = i;
234 }
235 }
236
237 exec_fnamep = argv[0];
238
239 ucp -= alen;
240 error |= copyout(scratchargs, (caddr_t)ucp, alen);
241
242 if (p->p_model == DATAMODEL_ILP32) {
243 uintptr32_t *argv32;
244
245 argv32 = kmem_zalloc(argvlen / 2, KM_SLEEP);
246
247 for (i = 0; i < argc; i++)
248 argv32[i] = (uintptr32_t)argv[i];
249
250 kmem_free(argv, argvlen);
251 argv = (uintptr_t *)argv32;
252 argvlen /= 2;
253
254 wlen = sizeof (uintptr32_t);
255 }
256
257 uap = P2ALIGN(ucp, wlen);
258 /* advance to be below the word we're in */
259 uap -= wlen;
260 /* advance argc words down, plus one for NULL */
261 uap -= (argc + 1) * wlen;
262 error |= copyout(argv, (caddr_t)uap, argvlen);
263
264 if (error != 0) {
265 zcmn_err(p->p_zone->zone_id, CE_WARN,
266 "Could not construct stack for init.\n");
267 kmem_free(argv, argvlen);
268 kmem_free(scratchargs, alen);
269 return (EFAULT);
270 }
271
272 kmem_free(argv, argvlen);
273 kmem_free(scratchargs, alen);
274
275 /*
276 * Point at the arguments.
277 */
278 lwp->lwp_ap = lwp->lwp_arg;
279 lwp->lwp_arg[0] = exec_fnamep;
280 lwp->lwp_arg[1] = uap;
281 lwp->lwp_arg[2] = 0;
282 curthread->t_post_sys = 1;
283 curthread->t_sysnum = SYS_execve;
284
285 /*
286 * If we are executing init from zsched, we may have inherited its
287 * parent process's signal mask. Clear it now so that we behave in
288 * the same way as when started from the global zone.
289 */
290 sigemptyset(&curthread->t_hold);
291
292 brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
293 again:
294 error = exec_common((const char *)exec_fnamep,
295 (const char **)uap, NULL, NULL, brand_action);
296
297 /*
298 * Normally we would just set lwp_argsaved and t_post_sys and
299 * let post_syscall reset lwp_ap for us. Unfortunately,
300 * exec_init isn't always called from a system call. Instead
301 * of making a mess of trap_cleanup, we just reset the args
302 * pointer here.
303 */
304 reset_syscall_args();
305
306 switch (error) {
307 case 0:
308 return (0);
309
310 case ENOENT:
311 zcmn_err(p->p_zone->zone_id, CE_WARN,
312 "exec(%s) failed (file not found).\n", initpath);
313 return (ENOENT);
314
315 case EAGAIN:
316 case EINTR:
317 ++count;
318 if (count < 5) {
319 zcmn_err(p->p_zone->zone_id, CE_WARN,
320 "exec(%s) failed with errno %d. Retrying...\n",
321 initpath, error);
322 goto again;
323 }
324 }
325
326 zcmn_err(p->p_zone->zone_id, CE_WARN,
327 "exec(%s) failed with errno %d.", initpath, error);
328 return (error);
329 }
330
331 /*
332 * This routine does all of the common setup for invoking init; global
333 * and non-global zones employ this routine for the functionality which is
334 * in common.
335 *
336 * This program (init, presumably) must be a 32-bit process.
337 */
338 int
start_init_common()339 start_init_common()
340 {
341 proc_t *p = curproc;
342 ASSERT_STACK_ALIGNED();
343 p->p_zone->zone_proc_initpid = p->p_pid;
344
345 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
346 p->p_usrstack = (caddr_t)USRSTACK32;
347 p->p_model = DATAMODEL_ILP32;
348 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
349 p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
350 p->p_stk_ctl = INT32_MAX;
351
352 p->p_as = as_alloc();
353 p->p_as->a_proc = p;
354 p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
355 (void) hat_setup(p->p_as->a_hat, HAT_INIT);
356
357 init_core();
358
359 init_mstate(curthread, LMS_SYSTEM);
360 return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs));
361 }
362
363 /*
364 * Start the initial user process for the global zone; once running, if
365 * init should subsequently fail, it will be automatically be caught in the
366 * exit(2) path, and restarted by restart_init().
367 */
368 static void
start_init(void)369 start_init(void)
370 {
371 proc_init = curproc;
372
373 ASSERT(curproc->p_zone->zone_initname != NULL);
374
375 if (start_init_common() != 0)
376 halt("unix: Could not start init");
377 lwp_rtt();
378 }
379
380 void
main(void)381 main(void)
382 {
383 proc_t *p = ttoproc(curthread); /* &p0 */
384 int (**initptr)();
385 extern void sched();
386 extern void fsflush();
387 extern int (*init_tbl[])();
388 extern int (*mp_init_tbl[])();
389 extern id_t syscid, defaultcid;
390 extern int swaploaded;
391 extern int netboot;
392 extern ib_boot_prop_t *iscsiboot_prop;
393 extern void vm_init(void);
394 extern void cbe_init_pre(void);
395 extern void cbe_init(void);
396 extern void clock_tick_init_pre(void);
397 extern void clock_tick_init_post(void);
398 extern void clock_init(void);
399 extern void physio_bufs_init(void);
400 extern void pm_cfb_setup_intr(void);
401 extern int pm_adjust_timestamps(dev_info_t *, void *);
402 extern void start_other_cpus(int);
403 extern void sysevent_evc_thrinit();
404 extern kmutex_t ualock;
405 #if defined(__x86)
406 extern void fastboot_post_startup(void);
407 extern void progressbar_start(void);
408 #endif
409 /*
410 * In the horrible world of x86 in-lines, you can't get symbolic
411 * structure offsets a la genassym. This assertion is here so
412 * that the next poor slob who innocently changes the offset of
413 * cpu_thread doesn't waste as much time as I just did finding
414 * out that it's hard-coded in i86/ml/i86.il. Similarly for
415 * curcpup. You're welcome.
416 */
417 ASSERT(CPU == CPU->cpu_self);
418 ASSERT(curthread == CPU->cpu_thread);
419 ASSERT_STACK_ALIGNED();
420
421 /*
422 * We take the ualock until we have completed the startup
423 * to prevent kadmin() from disrupting this work. In particular,
424 * we don't want kadmin() to bring the system down while we are
425 * trying to start it up.
426 */
427 mutex_enter(&ualock);
428
429 /*
430 * Setup root lgroup and leaf lgroup for CPU 0
431 */
432 lgrp_init(LGRP_INIT_STAGE2);
433
434 /*
435 * Once 'startup()' completes, the thread_reaper() daemon would be
436 * created(in thread_init()). After that, it is safe to create threads
437 * that could exit. These exited threads will get reaped.
438 */
439 startup();
440 segkmem_gc();
441 callb_init();
442 cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */
443 ddi_periodic_init();
444 cbe_init();
445 callout_init(); /* callout table MUST be init'd after cyclics */
446 clock_tick_init_pre();
447 clock_init();
448
449 #if defined(__x86)
450 /*
451 * The progressbar thread uses cv_reltimedwait() and hence needs to be
452 * started after the callout mechanism has been initialized.
453 */
454 progressbar_start();
455 #endif
456 /*
457 * On some platforms, clkinitf() changes the timing source that
458 * gethrtime_unscaled() uses to generate timestamps. cbe_init() calls
459 * clkinitf(), so re-initialize the microstate counters after the
460 * timesource has been chosen.
461 */
462 init_mstate(&t0, LMS_SYSTEM);
463 init_cpu_mstate(CPU, CMS_SYSTEM);
464
465 /*
466 * May need to probe to determine latencies from CPU 0 after
467 * gethrtime() comes alive in cbe_init() and before enabling interrupts
468 * and copy and release any temporary memory allocated with BOP_ALLOC()
469 * before release_bootstrap() frees boot memory
470 */
471 lgrp_init(LGRP_INIT_STAGE3);
472
473 /*
474 * Call all system initialization functions.
475 */
476 for (initptr = &init_tbl[0]; *initptr; initptr++)
477 (**initptr)();
478 /*
479 * Load iSCSI boot properties
480 */
481 ld_ib_prop();
482 /*
483 * initialize vm related stuff.
484 */
485 vm_init();
486
487 /*
488 * initialize buffer pool for raw I/O requests
489 */
490 physio_bufs_init();
491
492 ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */
493
494 /*
495 * Drop the interrupt level and allow interrupts. At this point
496 * the DDI guarantees that interrupts are enabled.
497 */
498 (void) spl0();
499 interrupts_unleashed = 1;
500
501 /*
502 * Create kmem cache for proc structures
503 */
504 process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
505 0, NULL, NULL, NULL, NULL, NULL, 0);
506
507 vfs_mountroot(); /* Mount the root file system */
508 errorq_init(); /* after vfs_mountroot() so DDI root is ready */
509 cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */
510 ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
511 /* after vfs_mountroot() so hrestime is valid */
512
513 post_startup();
514 swaploaded = 1;
515
516 /*
517 * Initialize Solaris Audit Subsystem
518 */
519 audit_init();
520
521 /*
522 * Start the periodic hash rescale for all vmem arenas before we load
523 * protocol modules and drivers via strplumb() below. Some drivers
524 * might rely on heavy vmem operations that could hurt performance
525 * without the rescale.
526 */
527 vmem_update(NULL);
528
529 /*
530 * Plumb the protocol modules and drivers only if we are not
531 * networked booted, in this case we already did it in rootconf().
532 */
533 if (netboot == 0 && iscsiboot_prop == NULL)
534 (void) strplumb();
535
536 gethrestime(&PTOU(curproc)->u_start);
537 curthread->t_start = PTOU(curproc)->u_start.tv_sec;
538 p->p_mstart = gethrtime();
539
540 /*
541 * Perform setup functions that can only be done after root
542 * and swap have been set up.
543 */
544 consconfig();
545 #ifndef __sparc
546 release_bootstrap();
547 #endif
548
549 /*
550 * attach drivers with ddi-forceattach prop
551 * It must be done early enough to load hotplug drivers (e.g.
552 * pcmcia nexus) so that devices enumerated via hotplug is
553 * available before I/O subsystem is fully initialized.
554 */
555 i_ddi_forceattach_drivers();
556
557 /*
558 * Set the scan rate and other parameters of the paging subsystem.
559 */
560 setupclock();
561
562 /*
563 * Initialize process 0's lwp directory and lwpid hash table.
564 */
565 p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
566 p->p_lwpdir->ld_next = p->p_lwpdir + 1;
567 p->p_lwpdir_sz = 2;
568 p->p_tidhash = p0_tidhash;
569 p->p_tidhash_sz = 2;
570 p0_lep.le_thread = curthread;
571 p0_lep.le_lwpid = curthread->t_tid;
572 p0_lep.le_start = curthread->t_start;
573 lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0);
574
575 /*
576 * Initialize extended accounting.
577 */
578 exacct_init();
579
580 /*
581 * Initialize threads of sysevent event channels
582 */
583 sysevent_evc_thrinit();
584
585 /*
586 * This must be done after post_startup() but before
587 * start_other_cpus()
588 */
589 lgrp_init(LGRP_INIT_STAGE4);
590
591 /*
592 * Perform MP initialization, if any.
593 */
594 start_other_cpus(0);
595
596 #ifdef __sparc
597 /*
598 * Release bootstrap here since PROM interfaces are
599 * used to start other CPUs above.
600 */
601 release_bootstrap();
602 #endif
603
604 /*
605 * Finish lgrp initialization after all CPUS are brought online.
606 */
607 lgrp_init(LGRP_INIT_STAGE5);
608
609 /*
610 * After mp_init(), number of cpus are known (this is
611 * true for the time being, when there are actually
612 * hot pluggable cpus then this scheme would not do).
613 * Any per cpu initialization is done here.
614 */
615 kmem_mp_init();
616
617 clock_tick_init_post();
618
619 for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
620 (**initptr)();
621
622 /*
623 * These must be called after start_other_cpus
624 */
625 pm_cfb_setup_intr();
626 #if defined(__x86)
627 fastboot_post_startup();
628
629 smt_late_init();
630 #endif
631
632 /*
633 * Make init process; enter scheduling loop with system process.
634 *
635 * Note that we manually assign the pids for these processes, for
636 * historical reasons. If more pre-assigned pids are needed,
637 * FAMOUS_PIDS will have to be updated.
638 */
639
640 /* create init process */
641 if (newproc(start_init, NULL, defaultcid, 59, NULL,
642 FAMOUS_PID_INIT))
643 panic("main: unable to fork init.");
644
645 /* create pageout daemon */
646 if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
647 FAMOUS_PID_PAGEOUT))
648 panic("main: unable to fork pageout()");
649
650 /* create fsflush daemon */
651 if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
652 FAMOUS_PID_FSFLUSH))
653 panic("main: unable to fork fsflush()");
654
655 /* create cluster process if we're a member of one */
656 if (cluster_bootflags & CLUSTER_BOOTED) {
657 if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
658 NULL, 0)) {
659 panic("main: unable to fork cluster()");
660 }
661 }
662
663 /*
664 * Create system threads (threads are associated with p0)
665 */
666
667 /* create module uninstall daemon */
668 /* BugID 1132273. If swapping over NFS need a bigger stack */
669 (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
670 NULL, 0, &p0, TS_RUN, minclsyspri);
671
672 (void) thread_create(NULL, 0, seg_pasync_thread,
673 NULL, 0, &p0, TS_RUN, minclsyspri);
674
675 pid_setmin();
676
677 /* system is now ready */
678 mutex_exit(&ualock);
679
680 bcopy("sched", PTOU(curproc)->u_psargs, 6);
681 bcopy("sched", PTOU(curproc)->u_comm, 5);
682 sched();
683 /* NOTREACHED */
684 }
685