xref: /illumos-gate/usr/src/uts/common/os/main.c (revision 9b664393d4fdda96221e6ea9ea95790d3c15be70)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1988 AT&T	*/
27 /*	  All Rights Reserved		*/
28 
29 /*
30  * Copyright 2019 Joyent, Inc.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/pcb.h>
37 #include <sys/systm.h>
38 #include <sys/signal.h>
39 #include <sys/cred.h>
40 #include <sys/user.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/proc.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/priocntl.h>
47 #include <sys/procset.h>
48 #include <sys/disp.h>
49 #include <sys/callo.h>
50 #include <sys/callb.h>
51 #include <sys/debug.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/utsname.h>
55 #include <sys/cmn_err.h>
56 #include <sys/vmparam.h>
57 #include <sys/modctl.h>
58 #include <sys/vm.h>
59 #include <sys/callb.h>
60 #include <sys/ddi_periodic.h>
61 #include <sys/kmem.h>
62 #include <sys/vmem.h>
63 #include <sys/cpuvar.h>
64 #include <sys/cladm.h>
65 #include <sys/corectl.h>
66 #include <sys/exec.h>
67 #include <sys/syscall.h>
68 #include <sys/reboot.h>
69 #include <sys/task.h>
70 #include <sys/exacct.h>
71 #include <sys/autoconf.h>
72 #include <sys/errorq.h>
73 #include <sys/class.h>
74 #include <sys/stack.h>
75 #include <sys/brand.h>
76 #include <sys/mmapobj.h>
77 #include <sys/smt.h>
78 
79 #include <vm/as.h>
80 #include <vm/seg_kmem.h>
81 #include <sys/dc_ki.h>
82 
83 #include <c2/audit.h>
84 #include <sys/bootprops.h>
85 
86 /* well known processes */
87 proc_t *proc_sched;		/* memory scheduler */
88 proc_t *proc_init;		/* init */
89 proc_t *proc_pageout;		/* pageout daemon */
90 proc_t *proc_fsflush;		/* fsflush daemon */
91 
92 pgcnt_t	maxmem;		/* Maximum available memory in pages.	*/
93 pgcnt_t	freemem;	/* Current available memory in pages.	*/
94 int	interrupts_unleashed;	/* set when we do the first spl0() */
95 
96 kmem_cache_t *process_cache;	/* kmem cache for proc structures */
97 
98 /*
99  * Indicates whether the auditing module (c2audit) is loaded. Possible
100  * values are:
101  * 0 - c2audit module is excluded in /etc/system and cannot be loaded
102  * 1 - c2audit module is not loaded but can be anytime
103  * 2 - c2audit module is loaded
104  */
105 int audit_active = C2AUDIT_DISABLED;
106 
107 /*
108  * Process 0's lwp directory and lwpid hash table.
109  */
110 lwpdir_t p0_lwpdir[2];
111 tidhash_t p0_tidhash[2];
112 lwpent_t p0_lep;
113 
114 /*
115  * Machine-independent initialization code
116  * Called from cold start routine as
117  * soon as a stack and segmentation
118  * have been established.
119  * Functions:
120  *	clear and free user core
121  *	turn on clock
122  *	hand craft 0th process
123  *	call all initialization routines
124  *	fork	- process 0 to schedule
125  *		- process 1 execute bootstrap
126  *		- process 2 to page out
127  *	create system threads
128  */
129 
/* main() forks the cluster process only when CLUSTER_BOOTED is set here. */
int cluster_bootflags = 0;

/*
 * Body of the cluster process created by main(): run cluster() and panic
 * if it ever comes back.
 */
void
cluster_wrapper(void)
{
	cluster();
	panic("cluster()  returned");
}
138 
139 char initname[INITNAME_SZ] = "/sbin/init";	/* also referenced by zone0 */
140 char initargs[BOOTARGS_MAX] = "";		/* also referenced by zone0 */
141 
142 /*
143  * Construct a stack for init containing the arguments to it, then
144  * pass control to exec_common.
145  */
146 int
147 exec_init(const char *initpath, const char *args)
148 {
149 	uintptr_t ucp;
150 	uintptr_t uap;
151 	uintptr_t *argv;
152 	uintptr_t exec_fnamep;
153 	char *scratchargs;
154 	int i, sarg;
155 	size_t argvlen, alen;
156 	size_t wlen = sizeof (uintptr_t);
157 	boolean_t in_arg;
158 	int argc = 0;
159 	int error = 0, count = 0;
160 	proc_t *p = ttoproc(curthread);
161 	klwp_t *lwp = ttolwp(curthread);
162 	int brand_action;
163 
164 	if (args == NULL)
165 		args = "";
166 
167 	alen = strlen(initpath) + 1 + strlen(args) + 1;
168 	scratchargs = kmem_alloc(alen, KM_SLEEP);
169 	(void) snprintf(scratchargs, alen, "%s %s", initpath, args);
170 
171 	/*
172 	 * We do a quick two state parse of the string to sort out how big
173 	 * argc should be.
174 	 */
175 	in_arg = B_FALSE;
176 	for (i = 0; i < strlen(scratchargs); i++) {
177 		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
178 			if (in_arg) {
179 				in_arg = B_FALSE;
180 				argc++;
181 			}
182 		} else {
183 			in_arg = B_TRUE;
184 		}
185 	}
186 	argvlen = sizeof (uintptr_t) * (argc + 1);
187 	argv = kmem_zalloc(argvlen, KM_SLEEP);
188 
189 	/*
190 	 * We pull off a bit of a hack here.  We work our way through the
191 	 * args string, putting nulls at the ends of space delimited tokens
192 	 * (boot args don't support quoting at this time).  Then we just
193 	 * copy the whole mess to userland in one go.  In other words, we
194 	 * transform this: "init -s -r\0" into this on the stack:
195 	 *
196 	 *	-0x00 \0
197 	 *	-0x01 r
198 	 *	-0x02 -  <--------.
199 	 *	-0x03 \0	  |
200 	 *	-0x04 s		  |
201 	 *	-0x05 -  <------. |
202 	 *	-0x06 \0	| |
203 	 *	-0x07 t		| |
204 	 *	-0x08 i		| |
205 	 *	-0x09 n		| |
206 	 *	-0x0a i  <---.  | |
207 	 *	-0x10 NULL   |  | |	(argv[3])
208 	 *	-0x14   -----|--|-'	(argv[2])
209 	 *	-0x18  ------|--'	(argv[1])
210 	 *	-0x1c -------'		(argv[0])
211 	 *
212 	 * Since we know the value of ucp at the beginning of this process,
213 	 * we can trivially compute the argv[] array which we also need to
214 	 * place in userland: argv[i] = ucp - sarg(i), where ucp is the
215 	 * stack ptr, and sarg is the string index of the start of the
216 	 * argument.
217 	 */
218 	ucp = (uintptr_t)p->p_usrstack;
219 
220 	argc = 0;
221 	in_arg = B_FALSE;
222 	sarg = 0;
223 
224 	for (i = 0; i < alen; i++) {
225 		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
226 			if (in_arg == B_TRUE) {
227 				in_arg = B_FALSE;
228 				scratchargs[i] = '\0';
229 				argv[argc++] = ucp - (alen - sarg);
230 			}
231 		} else if (in_arg == B_FALSE) {
232 			in_arg = B_TRUE;
233 			sarg = i;
234 		}
235 	}
236 
237 	exec_fnamep = argv[0];
238 
239 	ucp -= alen;
240 	error |= copyout(scratchargs, (caddr_t)ucp, alen);
241 
242 	if (p->p_model == DATAMODEL_ILP32) {
243 		uintptr32_t *argv32;
244 
245 		argv32 = kmem_zalloc(argvlen / 2, KM_SLEEP);
246 
247 		for (i = 0; i < argc; i++)
248 			argv32[i] = (uintptr32_t)argv[i];
249 
250 		kmem_free(argv, argvlen);
251 		argv = (uintptr_t *)argv32;
252 		argvlen /= 2;
253 
254 		wlen = sizeof (uintptr32_t);
255 	}
256 
257 	uap = P2ALIGN(ucp, wlen);
258 	/* advance to be below the word we're in */
259 	uap -= wlen;
260 	/* advance argc words down, plus one for NULL */
261 	uap -= (argc + 1) * wlen;
262 	error |= copyout(argv, (caddr_t)uap, argvlen);
263 
264 	if (error != 0) {
265 		zcmn_err(p->p_zone->zone_id, CE_WARN,
266 		    "Could not construct stack for init.\n");
267 		kmem_free(argv, argvlen);
268 		kmem_free(scratchargs, alen);
269 		return (EFAULT);
270 	}
271 
272 	kmem_free(argv, argvlen);
273 	kmem_free(scratchargs, alen);
274 
275 	/*
276 	 * Point at the arguments.
277 	 */
278 	lwp->lwp_ap = lwp->lwp_arg;
279 	lwp->lwp_arg[0] = exec_fnamep;
280 	lwp->lwp_arg[1] = uap;
281 	lwp->lwp_arg[2] = 0;
282 	curthread->t_post_sys = 1;
283 	curthread->t_sysnum = SYS_execve;
284 
285 	/*
286 	 * If we are executing init from zsched, we may have inherited its
287 	 * parent process's signal mask.  Clear it now so that we behave in
288 	 * the same way as when started from the global zone.
289 	 */
290 	sigemptyset(&curthread->t_hold);
291 
292 	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
293 again:
294 	error = exec_common((const char *)exec_fnamep,
295 	    (const char **)uap, NULL, brand_action);
296 
297 	/*
298 	 * Normally we would just set lwp_argsaved and t_post_sys and
299 	 * let post_syscall reset lwp_ap for us.  Unfortunately,
300 	 * exec_init isn't always called from a system call.  Instead
301 	 * of making a mess of trap_cleanup, we just reset the args
302 	 * pointer here.
303 	 */
304 	reset_syscall_args();
305 
306 	switch (error) {
307 	case 0:
308 		return (0);
309 
310 	case ENOENT:
311 		zcmn_err(p->p_zone->zone_id, CE_WARN,
312 		    "exec(%s) failed (file not found).\n", initpath);
313 		return (ENOENT);
314 
315 	case EAGAIN:
316 	case EINTR:
317 		++count;
318 		if (count < 5) {
319 			zcmn_err(p->p_zone->zone_id, CE_WARN,
320 			    "exec(%s) failed with errno %d.  Retrying...\n",
321 			    initpath, error);
322 			goto again;
323 		}
324 	}
325 
326 	zcmn_err(p->p_zone->zone_id, CE_WARN,
327 	    "exec(%s) failed with errno %d.", initpath, error);
328 	return (error);
329 }
330 
331 /*
332  * This routine does all of the common setup for invoking init; global
333  * and non-global zones employ this routine for the functionality which is
334  * in common.
335  *
336  * This program (init, presumably) must be a 32-bit process.
337  */
338 int
339 start_init_common()
340 {
341 	proc_t *p = curproc;
342 	ASSERT_STACK_ALIGNED();
343 	p->p_zone->zone_proc_initpid = p->p_pid;
344 
345 	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
346 	p->p_usrstack = (caddr_t)USRSTACK32;
347 	p->p_model = DATAMODEL_ILP32;
348 	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
349 	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
350 	p->p_stk_ctl = INT32_MAX;
351 
352 	p->p_as = as_alloc();
353 	p->p_as->a_proc = p;
354 	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
355 	(void) hat_setup(p->p_as->a_hat, HAT_INIT);
356 
357 	init_core();
358 
359 	init_mstate(curthread, LMS_SYSTEM);
360 	return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs));
361 }
362 
363 /*
364  * Start the initial user process for the global zone; once running, if
365  * init should subsequently fail, it will be automatically be caught in the
366  * exit(2) path, and restarted by restart_init().
367  */
368 static void
369 start_init(void)
370 {
371 	proc_init = curproc;
372 
373 	ASSERT(curproc->p_zone->zone_initname != NULL);
374 
375 	if (start_init_common() != 0)
376 		halt("unix: Could not start init");
377 	lwp_rtt();
378 }
379 
380 void
381 main(void)
382 {
383 	proc_t		*p = ttoproc(curthread);	/* &p0 */
384 	int		(**initptr)();
385 	extern void	sched();
386 	extern void	fsflush();
387 	extern int	(*init_tbl[])();
388 	extern int	(*mp_init_tbl[])();
389 	extern id_t	syscid, defaultcid;
390 	extern int	swaploaded;
391 	extern int	netboot;
392 	extern ib_boot_prop_t *iscsiboot_prop;
393 	extern void	vm_init(void);
394 	extern void	cbe_init_pre(void);
395 	extern void	cbe_init(void);
396 	extern void	clock_tick_init_pre(void);
397 	extern void	clock_tick_init_post(void);
398 	extern void	clock_init(void);
399 	extern void	physio_bufs_init(void);
400 	extern void	pm_cfb_setup_intr(void);
401 	extern int	pm_adjust_timestamps(dev_info_t *, void *);
402 	extern void	start_other_cpus(int);
403 	extern void	sysevent_evc_thrinit();
404 	extern kmutex_t	ualock;
405 #if defined(__x86)
406 	extern void	fastboot_post_startup(void);
407 	extern void	progressbar_start(void);
408 #endif
409 	/*
410 	 * In the horrible world of x86 in-lines, you can't get symbolic
411 	 * structure offsets a la genassym.  This assertion is here so
412 	 * that the next poor slob who innocently changes the offset of
413 	 * cpu_thread doesn't waste as much time as I just did finding
414 	 * out that it's hard-coded in i86/ml/i86.il.  Similarly for
415 	 * curcpup.  You're welcome.
416 	 */
417 	ASSERT(CPU == CPU->cpu_self);
418 	ASSERT(curthread == CPU->cpu_thread);
419 	ASSERT_STACK_ALIGNED();
420 
421 	/*
422 	 * We take the ualock until we have completed the startup
423 	 * to prevent kadmin() from disrupting this work. In particular,
424 	 * we don't want kadmin() to bring the system down while we are
425 	 * trying to start it up.
426 	 */
427 	mutex_enter(&ualock);
428 
429 	/*
430 	 * Setup root lgroup and leaf lgroup for CPU 0
431 	 */
432 	lgrp_init(LGRP_INIT_STAGE2);
433 
434 	/*
435 	 * Once 'startup()' completes, the thread_reaper() daemon would be
436 	 * created(in thread_init()). After that, it is safe to create threads
437 	 * that could exit. These exited threads will get reaped.
438 	 */
439 	startup();
440 	segkmem_gc();
441 	callb_init();
442 	cbe_init_pre();	/* x86 must initialize gethrtimef before timer_init */
443 	ddi_periodic_init();
444 	cbe_init();
445 	callout_init();	/* callout table MUST be init'd after cyclics */
446 	clock_tick_init_pre();
447 	clock_init();
448 
449 #if defined(__x86)
450 	/*
451 	 * The progressbar thread uses cv_reltimedwait() and hence needs to be
452 	 * started after the callout mechanism has been initialized.
453 	 */
454 	progressbar_start();
455 #endif
456 	/*
457 	 * On some platforms, clkinitf() changes the timing source that
458 	 * gethrtime_unscaled() uses to generate timestamps.  cbe_init() calls
459 	 * clkinitf(), so re-initialize the microstate counters after the
460 	 * timesource has been chosen.
461 	 */
462 	init_mstate(&t0, LMS_SYSTEM);
463 	init_cpu_mstate(CPU, CMS_SYSTEM);
464 
465 	/*
466 	 * May need to probe to determine latencies from CPU 0 after
467 	 * gethrtime() comes alive in cbe_init() and before enabling interrupts
468 	 * and copy and release any temporary memory allocated with BOP_ALLOC()
469 	 * before release_bootstrap() frees boot memory
470 	 */
471 	lgrp_init(LGRP_INIT_STAGE3);
472 
473 	/*
474 	 * Call all system initialization functions.
475 	 */
476 	for (initptr = &init_tbl[0]; *initptr; initptr++)
477 		(**initptr)();
478 	/*
479 	 * Load iSCSI boot properties
480 	 */
481 	ld_ib_prop();
482 	/*
483 	 * initialize vm related stuff.
484 	 */
485 	vm_init();
486 
487 	/*
488 	 * initialize buffer pool for raw I/O requests
489 	 */
490 	physio_bufs_init();
491 
492 	ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */
493 
494 	/*
495 	 * Drop the interrupt level and allow interrupts.  At this point
496 	 * the DDI guarantees that interrupts are enabled.
497 	 */
498 	(void) spl0();
499 	interrupts_unleashed = 1;
500 
501 	/*
502 	 * Create kmem cache for proc structures
503 	 */
504 	process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
505 	    0, NULL, NULL, NULL, NULL, NULL, 0);
506 
507 	vfs_mountroot();	/* Mount the root file system */
508 	errorq_init();		/* after vfs_mountroot() so DDI root is ready */
509 	cpu_kstat_init(CPU);	/* after vfs_mountroot() so TOD is valid */
510 	ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
511 				/* after vfs_mountroot() so hrestime is valid */
512 
513 	post_startup();
514 	swaploaded = 1;
515 
516 	/*
517 	 * Initialize Solaris Audit Subsystem
518 	 */
519 	audit_init();
520 
521 	/*
522 	 * Start the periodic hash rescale for all vmem arenas before we load
523 	 * protocol modules and drivers via strplumb() below.  Some drivers
524 	 * might rely on heavy vmem operations that could hurt performance
525 	 * without the rescale.
526 	 */
527 	vmem_update(NULL);
528 
529 	/*
530 	 * Plumb the protocol modules and drivers only if we are not
531 	 * networked booted, in this case we already did it in rootconf().
532 	 */
533 	if (netboot == 0 && iscsiboot_prop == NULL)
534 		(void) strplumb();
535 
536 	gethrestime(&PTOU(curproc)->u_start);
537 	curthread->t_start = PTOU(curproc)->u_start.tv_sec;
538 	p->p_mstart = gethrtime();
539 
540 	/*
541 	 * Perform setup functions that can only be done after root
542 	 * and swap have been set up.
543 	 */
544 	consconfig();
545 #ifndef	__sparc
546 	release_bootstrap();
547 #endif
548 
549 	/*
550 	 * attach drivers with ddi-forceattach prop
551 	 * It must be done early enough to load hotplug drivers (e.g.
552 	 * pcmcia nexus) so that devices enumerated via hotplug is
553 	 * available before I/O subsystem is fully initialized.
554 	 */
555 	i_ddi_forceattach_drivers();
556 
557 	/*
558 	 * Set the scan rate and other parameters of the paging subsystem.
559 	 */
560 	setupclock();
561 
562 	/*
563 	 * Initialize process 0's lwp directory and lwpid hash table.
564 	 */
565 	p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
566 	p->p_lwpdir->ld_next = p->p_lwpdir + 1;
567 	p->p_lwpdir_sz = 2;
568 	p->p_tidhash = p0_tidhash;
569 	p->p_tidhash_sz = 2;
570 	p0_lep.le_thread = curthread;
571 	p0_lep.le_lwpid = curthread->t_tid;
572 	p0_lep.le_start = curthread->t_start;
573 	lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0);
574 
575 	/*
576 	 * Initialize extended accounting.
577 	 */
578 	exacct_init();
579 
580 	/*
581 	 * Initialize threads of sysevent event channels
582 	 */
583 	sysevent_evc_thrinit();
584 
585 	/*
586 	 * This must be done after post_startup() but before
587 	 * start_other_cpus()
588 	 */
589 	lgrp_init(LGRP_INIT_STAGE4);
590 
591 	/*
592 	 * Perform MP initialization, if any.
593 	 */
594 	start_other_cpus(0);
595 
596 #ifdef	__sparc
597 	/*
598 	 * Release bootstrap here since PROM interfaces are
599 	 * used to start other CPUs above.
600 	 */
601 	release_bootstrap();
602 #endif
603 
604 	/*
605 	 * Finish lgrp initialization after all CPUS are brought online.
606 	 */
607 	lgrp_init(LGRP_INIT_STAGE5);
608 
609 	/*
610 	 * After mp_init(), number of cpus are known (this is
611 	 * true for the time being, when there are actually
612 	 * hot pluggable cpus then this scheme  would not do).
613 	 * Any per cpu initialization is done here.
614 	 */
615 	kmem_mp_init();
616 
617 	clock_tick_init_post();
618 
619 	for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
620 		(**initptr)();
621 
622 	/*
623 	 * These must be called after start_other_cpus
624 	 */
625 	pm_cfb_setup_intr();
626 #if defined(__x86)
627 	fastboot_post_startup();
628 
629 	smt_late_init();
630 #endif
631 
632 	/*
633 	 * Make init process; enter scheduling loop with system process.
634 	 *
635 	 * Note that we manually assign the pids for these processes, for
636 	 * historical reasons.  If more pre-assigned pids are needed,
637 	 * FAMOUS_PIDS will have to be updated.
638 	 */
639 
640 	/* create init process */
641 	if (newproc(start_init, NULL, defaultcid, 59, NULL,
642 	    FAMOUS_PID_INIT))
643 		panic("main: unable to fork init.");
644 
645 	/* create pageout daemon */
646 	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
647 	    FAMOUS_PID_PAGEOUT))
648 		panic("main: unable to fork pageout()");
649 
650 	/* create fsflush daemon */
651 	if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
652 	    FAMOUS_PID_FSFLUSH))
653 		panic("main: unable to fork fsflush()");
654 
655 	/* create cluster process if we're a member of one */
656 	if (cluster_bootflags & CLUSTER_BOOTED) {
657 		if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
658 		    NULL, 0)) {
659 			panic("main: unable to fork cluster()");
660 		}
661 	}
662 
663 	/*
664 	 * Create system threads (threads are associated with p0)
665 	 */
666 
667 	/* create module uninstall daemon */
668 	/* BugID 1132273. If swapping over NFS need a bigger stack */
669 	(void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
670 	    NULL, 0, &p0, TS_RUN, minclsyspri);
671 
672 	(void) thread_create(NULL, 0, seg_pasync_thread,
673 	    NULL, 0, &p0, TS_RUN, minclsyspri);
674 
675 	pid_setmin();
676 
677 	/* system is now ready */
678 	mutex_exit(&ualock);
679 
680 	bcopy("sched", PTOU(curproc)->u_psargs, 6);
681 	bcopy("sched", PTOU(curproc)->u_comm, 5);
682 	sched();
683 	/* NOTREACHED */
684 }
685