1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1988 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Copyright 2019 Joyent, Inc. 31 */ 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/sysmacros.h> 36 #include <sys/pcb.h> 37 #include <sys/systm.h> 38 #include <sys/signal.h> 39 #include <sys/cred.h> 40 #include <sys/user.h> 41 #include <sys/vfs.h> 42 #include <sys/vnode.h> 43 #include <sys/proc.h> 44 #include <sys/time.h> 45 #include <sys/file.h> 46 #include <sys/priocntl.h> 47 #include <sys/procset.h> 48 #include <sys/disp.h> 49 #include <sys/callo.h> 50 #include <sys/callb.h> 51 #include <sys/debug.h> 52 #include <sys/conf.h> 53 #include <sys/bootconf.h> 54 #include <sys/utsname.h> 55 #include <sys/cmn_err.h> 56 #include <sys/vmparam.h> 57 #include <sys/modctl.h> 58 #include <sys/vm.h> 59 #include <sys/callb.h> 60 #include <sys/ddi_periodic.h> 61 #include <sys/kmem.h> 62 #include <sys/vmem.h> 63 #include <sys/cpuvar.h> 64 #include <sys/cladm.h> 65 #include <sys/corectl.h> 66 #include <sys/exec.h> 67 #include <sys/syscall.h> 68 #include <sys/reboot.h> 69 #include <sys/task.h> 70 #include <sys/exacct.h> 71 #include <sys/autoconf.h> 72 #include <sys/errorq.h> 73 #include <sys/class.h> 74 #include <sys/stack.h> 75 #include <sys/brand.h> 76 #include <sys/mmapobj.h> 77 #include <sys/smt.h> 78 79 #include <vm/as.h> 80 #include <vm/seg_kmem.h> 81 #include <sys/dc_ki.h> 82 83 #include <c2/audit.h> 84 #include <sys/bootprops.h> 85 86 /* well known processes */ 87 proc_t *proc_sched; /* memory scheduler */ 88 proc_t *proc_init; /* init */ 89 proc_t *proc_pageout; /* pageout daemon */ 90 proc_t *proc_fsflush; /* fsflush daemon */ 91 92 pgcnt_t maxmem; /* Maximum available memory in pages. */ 93 pgcnt_t freemem; /* Current available memory in pages. */ 94 int interrupts_unleashed; /* set when we do the first spl0() */ 95 96 kmem_cache_t *process_cache; /* kmem cache for proc structures */ 97 98 /* 99 * Indicates whether the auditing module (c2audit) is loaded. Possible 100 * values are: 101 * 0 - c2audit module is excluded in /etc/system and cannot be loaded 102 * 1 - c2audit module is not loaded but can be anytime 103 * 2 - c2audit module is loaded 104 */ 105 int audit_active = C2AUDIT_DISABLED; 106 107 /* 108 * Process 0's lwp directory and lwpid hash table. 109 */ 110 lwpdir_t p0_lwpdir[2]; 111 tidhash_t p0_tidhash[2]; 112 lwpent_t p0_lep; 113 114 /* 115 * Machine-independent initialization code 116 * Called from cold start routine as 117 * soon as a stack and segmentation 118 * have been established. 119 * Functions: 120 * clear and free user core 121 * turn on clock 122 * hand craft 0th process 123 * call all initialization routines 124 * fork - process 0 to schedule 125 * - process 1 execute bootstrap 126 * - process 2 to page out 127 * create system threads 128 */ 129 130 int cluster_bootflags = 0; 131 132 void 133 cluster_wrapper(void) 134 { 135 cluster(); 136 panic("cluster() returned"); 137 } 138 139 char initname[INITNAME_SZ] = "/sbin/init"; /* also referenced by zone0 */ 140 char initargs[BOOTARGS_MAX] = ""; /* also referenced by zone0 */ 141 142 /* 143 * Construct a stack for init containing the arguments to it, then 144 * pass control to exec_common. 145 */ 146 int 147 exec_init(const char *initpath, const char *args) 148 { 149 uintptr_t ucp; 150 uintptr_t uap; 151 uintptr_t *argv; 152 uintptr_t exec_fnamep; 153 char *scratchargs; 154 int i, sarg; 155 size_t argvlen, alen; 156 size_t wlen = sizeof (uintptr_t); 157 boolean_t in_arg; 158 int argc = 0; 159 int error = 0, count = 0; 160 proc_t *p = ttoproc(curthread); 161 klwp_t *lwp = ttolwp(curthread); 162 int brand_action; 163 164 if (args == NULL) 165 args = ""; 166 167 alen = strlen(initpath) + 1 + strlen(args) + 1; 168 scratchargs = kmem_alloc(alen, KM_SLEEP); 169 (void) snprintf(scratchargs, alen, "%s %s", initpath, args); 170 171 /* 172 * We do a quick two state parse of the string to sort out how big 173 * argc should be. 174 */ 175 in_arg = B_FALSE; 176 for (i = 0; i < strlen(scratchargs); i++) { 177 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') { 178 if (in_arg) { 179 in_arg = B_FALSE; 180 argc++; 181 } 182 } else { 183 in_arg = B_TRUE; 184 } 185 } 186 argvlen = sizeof (uintptr_t) * (argc + 1); 187 argv = kmem_zalloc(argvlen, KM_SLEEP); 188 189 /* 190 * We pull off a bit of a hack here. We work our way through the 191 * args string, putting nulls at the ends of space delimited tokens 192 * (boot args don't support quoting at this time). Then we just 193 * copy the whole mess to userland in one go. In other words, we 194 * transform this: "init -s -r\0" into this on the stack: 195 * 196 * -0x00 \0 197 * -0x01 r 198 * -0x02 - <--------. 199 * -0x03 \0 | 200 * -0x04 s | 201 * -0x05 - <------. | 202 * -0x06 \0 | | 203 * -0x07 t | | 204 * -0x08 i | | 205 * -0x09 n | | 206 * -0x0a i <---. | | 207 * -0x10 NULL | | | (argv[3]) 208 * -0x14 -----|--|-' (argv[2]) 209 * -0x18 ------|--' (argv[1]) 210 * -0x1c -------' (argv[0]) 211 * 212 * Since we know the value of ucp at the beginning of this process, 213 * we can trivially compute the argv[] array which we also need to 214 * place in userland: argv[i] = ucp - sarg(i), where ucp is the 215 * stack ptr, and sarg is the string index of the start of the 216 * argument. 217 */ 218 ucp = (uintptr_t)p->p_usrstack; 219 220 argc = 0; 221 in_arg = B_FALSE; 222 sarg = 0; 223 224 for (i = 0; i < alen; i++) { 225 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') { 226 if (in_arg == B_TRUE) { 227 in_arg = B_FALSE; 228 scratchargs[i] = '\0'; 229 argv[argc++] = ucp - (alen - sarg); 230 } 231 } else if (in_arg == B_FALSE) { 232 in_arg = B_TRUE; 233 sarg = i; 234 } 235 } 236 237 exec_fnamep = argv[0]; 238 239 ucp -= alen; 240 error |= copyout(scratchargs, (caddr_t)ucp, alen); 241 242 if (p->p_model == DATAMODEL_ILP32) { 243 uintptr32_t *argv32; 244 245 argv32 = kmem_zalloc(argvlen / 2, KM_SLEEP); 246 247 for (i = 0; i < argc; i++) 248 argv32[i] = (uintptr32_t)argv[i]; 249 250 kmem_free(argv, argvlen); 251 argv = (uintptr_t *)argv32; 252 argvlen /= 2; 253 254 wlen = sizeof (uintptr32_t); 255 } 256 257 uap = P2ALIGN(ucp, wlen); 258 /* advance to be below the word we're in */ 259 uap -= wlen; 260 /* advance argc words down, plus one for NULL */ 261 uap -= (argc + 1) * wlen; 262 error |= copyout(argv, (caddr_t)uap, argvlen); 263 264 if (error != 0) { 265 zcmn_err(p->p_zone->zone_id, CE_WARN, 266 "Could not construct stack for init.\n"); 267 kmem_free(argv, argvlen); 268 kmem_free(scratchargs, alen); 269 return (EFAULT); 270 } 271 272 kmem_free(argv, argvlen); 273 kmem_free(scratchargs, alen); 274 275 /* 276 * Point at the arguments. 277 */ 278 lwp->lwp_ap = lwp->lwp_arg; 279 lwp->lwp_arg[0] = exec_fnamep; 280 lwp->lwp_arg[1] = uap; 281 lwp->lwp_arg[2] = 0; 282 curthread->t_post_sys = 1; 283 curthread->t_sysnum = SYS_execve; 284 285 /* 286 * If we are executing init from zsched, we may have inherited its 287 * parent process's signal mask. Clear it now so that we behave in 288 * the same way as when started from the global zone. 289 */ 290 sigemptyset(&curthread->t_hold); 291 292 brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; 293 again: 294 error = exec_common((const char *)exec_fnamep, 295 (const char **)uap, NULL, brand_action); 296 297 /* 298 * Normally we would just set lwp_argsaved and t_post_sys and 299 * let post_syscall reset lwp_ap for us. Unfortunately, 300 * exec_init isn't always called from a system call. Instead 301 * of making a mess of trap_cleanup, we just reset the args 302 * pointer here. 303 */ 304 reset_syscall_args(); 305 306 switch (error) { 307 case 0: 308 return (0); 309 310 case ENOENT: 311 zcmn_err(p->p_zone->zone_id, CE_WARN, 312 "exec(%s) failed (file not found).\n", initpath); 313 return (ENOENT); 314 315 case EAGAIN: 316 case EINTR: 317 ++count; 318 if (count < 5) { 319 zcmn_err(p->p_zone->zone_id, CE_WARN, 320 "exec(%s) failed with errno %d. Retrying...\n", 321 initpath, error); 322 goto again; 323 } 324 } 325 326 zcmn_err(p->p_zone->zone_id, CE_WARN, 327 "exec(%s) failed with errno %d.", initpath, error); 328 return (error); 329 } 330 331 /* 332 * This routine does all of the common setup for invoking init; global 333 * and non-global zones employ this routine for the functionality which is 334 * in common. 335 * 336 * This program (init, presumably) must be a 32-bit process. 337 */ 338 int 339 start_init_common() 340 { 341 proc_t *p = curproc; 342 ASSERT_STACK_ALIGNED(); 343 p->p_zone->zone_proc_initpid = p->p_pid; 344 345 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 346 p->p_usrstack = (caddr_t)USRSTACK32; 347 p->p_model = DATAMODEL_ILP32; 348 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 349 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 350 p->p_stk_ctl = INT32_MAX; 351 352 p->p_as = as_alloc(); 353 p->p_as->a_proc = p; 354 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 355 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 356 357 init_core(); 358 359 init_mstate(curthread, LMS_SYSTEM); 360 return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs)); 361 } 362 363 /* 364 * Start the initial user process for the global zone; once running, if 365 * init should subsequently fail, it will be automatically be caught in the 366 * exit(2) path, and restarted by restart_init(). 367 */ 368 static void 369 start_init(void) 370 { 371 proc_init = curproc; 372 373 ASSERT(curproc->p_zone->zone_initname != NULL); 374 375 if (start_init_common() != 0) 376 halt("unix: Could not start init"); 377 lwp_rtt(); 378 } 379 380 void 381 main(void) 382 { 383 proc_t *p = ttoproc(curthread); /* &p0 */ 384 int (**initptr)(); 385 extern void sched(); 386 extern void fsflush(); 387 extern int (*init_tbl[])(); 388 extern int (*mp_init_tbl[])(); 389 extern id_t syscid, defaultcid; 390 extern int swaploaded; 391 extern int netboot; 392 extern ib_boot_prop_t *iscsiboot_prop; 393 extern void vm_init(void); 394 extern void cbe_init_pre(void); 395 extern void cbe_init(void); 396 extern void clock_tick_init_pre(void); 397 extern void clock_tick_init_post(void); 398 extern void clock_init(void); 399 extern void physio_bufs_init(void); 400 extern void pm_cfb_setup_intr(void); 401 extern int pm_adjust_timestamps(dev_info_t *, void *); 402 extern void start_other_cpus(int); 403 extern void sysevent_evc_thrinit(); 404 extern kmutex_t ualock; 405 #if defined(__x86) 406 extern void fastboot_post_startup(void); 407 extern void progressbar_start(void); 408 #endif 409 /* 410 * In the horrible world of x86 in-lines, you can't get symbolic 411 * structure offsets a la genassym. This assertion is here so 412 * that the next poor slob who innocently changes the offset of 413 * cpu_thread doesn't waste as much time as I just did finding 414 * out that it's hard-coded in i86/ml/i86.il. Similarly for 415 * curcpup. You're welcome. 416 */ 417 ASSERT(CPU == CPU->cpu_self); 418 ASSERT(curthread == CPU->cpu_thread); 419 ASSERT_STACK_ALIGNED(); 420 421 /* 422 * We take the ualock until we have completed the startup 423 * to prevent kadmin() from disrupting this work. In particular, 424 * we don't want kadmin() to bring the system down while we are 425 * trying to start it up. 426 */ 427 mutex_enter(&ualock); 428 429 /* 430 * Setup root lgroup and leaf lgroup for CPU 0 431 */ 432 lgrp_init(LGRP_INIT_STAGE2); 433 434 /* 435 * Once 'startup()' completes, the thread_reaper() daemon would be 436 * created(in thread_init()). After that, it is safe to create threads 437 * that could exit. These exited threads will get reaped. 438 */ 439 startup(); 440 segkmem_gc(); 441 callb_init(); 442 cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */ 443 ddi_periodic_init(); 444 cbe_init(); 445 callout_init(); /* callout table MUST be init'd after cyclics */ 446 clock_tick_init_pre(); 447 clock_init(); 448 449 #if defined(__x86) 450 /* 451 * The progressbar thread uses cv_reltimedwait() and hence needs to be 452 * started after the callout mechanism has been initialized. 453 */ 454 progressbar_start(); 455 #endif 456 /* 457 * On some platforms, clkinitf() changes the timing source that 458 * gethrtime_unscaled() uses to generate timestamps. cbe_init() calls 459 * clkinitf(), so re-initialize the microstate counters after the 460 * timesource has been chosen. 461 */ 462 init_mstate(&t0, LMS_SYSTEM); 463 init_cpu_mstate(CPU, CMS_SYSTEM); 464 465 /* 466 * May need to probe to determine latencies from CPU 0 after 467 * gethrtime() comes alive in cbe_init() and before enabling interrupts 468 * and copy and release any temporary memory allocated with BOP_ALLOC() 469 * before release_bootstrap() frees boot memory 470 */ 471 lgrp_init(LGRP_INIT_STAGE3); 472 473 /* 474 * Call all system initialization functions. 475 */ 476 for (initptr = &init_tbl[0]; *initptr; initptr++) 477 (**initptr)(); 478 /* 479 * Load iSCSI boot properties 480 */ 481 ld_ib_prop(); 482 /* 483 * initialize vm related stuff. 484 */ 485 vm_init(); 486 487 /* 488 * initialize buffer pool for raw I/O requests 489 */ 490 physio_bufs_init(); 491 492 ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */ 493 494 /* 495 * Drop the interrupt level and allow interrupts. At this point 496 * the DDI guarantees that interrupts are enabled. 497 */ 498 (void) spl0(); 499 interrupts_unleashed = 1; 500 501 /* 502 * Create kmem cache for proc structures 503 */ 504 process_cache = kmem_cache_create("process_cache", sizeof (proc_t), 505 0, NULL, NULL, NULL, NULL, NULL, 0); 506 507 vfs_mountroot(); /* Mount the root file system */ 508 errorq_init(); /* after vfs_mountroot() so DDI root is ready */ 509 cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */ 510 ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL); 511 /* after vfs_mountroot() so hrestime is valid */ 512 513 post_startup(); 514 swaploaded = 1; 515 516 /* 517 * Initialize Solaris Audit Subsystem 518 */ 519 audit_init(); 520 521 /* 522 * Plumb the protocol modules and drivers only if we are not 523 * networked booted, in this case we already did it in rootconf(). 524 */ 525 if (netboot == 0 && iscsiboot_prop == NULL) 526 (void) strplumb(); 527 528 gethrestime(&PTOU(curproc)->u_start); 529 curthread->t_start = PTOU(curproc)->u_start.tv_sec; 530 p->p_mstart = gethrtime(); 531 532 /* 533 * Perform setup functions that can only be done after root 534 * and swap have been set up. 535 */ 536 consconfig(); 537 #ifndef __sparc 538 release_bootstrap(); 539 #endif 540 541 /* 542 * attach drivers with ddi-forceattach prop 543 * It must be done early enough to load hotplug drivers (e.g. 544 * pcmcia nexus) so that devices enumerated via hotplug is 545 * available before I/O subsystem is fully initialized. 546 */ 547 i_ddi_forceattach_drivers(); 548 549 /* 550 * Set the scan rate and other parameters of the paging subsystem. 551 */ 552 setupclock(0); 553 554 /* 555 * Initialize process 0's lwp directory and lwpid hash table. 556 */ 557 p->p_lwpdir = p->p_lwpfree = p0_lwpdir; 558 p->p_lwpdir->ld_next = p->p_lwpdir + 1; 559 p->p_lwpdir_sz = 2; 560 p->p_tidhash = p0_tidhash; 561 p->p_tidhash_sz = 2; 562 p0_lep.le_thread = curthread; 563 p0_lep.le_lwpid = curthread->t_tid; 564 p0_lep.le_start = curthread->t_start; 565 lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0); 566 567 /* 568 * Initialize extended accounting. 569 */ 570 exacct_init(); 571 572 /* 573 * Initialize threads of sysevent event channels 574 */ 575 sysevent_evc_thrinit(); 576 577 /* 578 * This must be done after post_startup() but before 579 * start_other_cpus() 580 */ 581 lgrp_init(LGRP_INIT_STAGE4); 582 583 /* 584 * Perform MP initialization, if any. 585 */ 586 start_other_cpus(0); 587 588 #ifdef __sparc 589 /* 590 * Release bootstrap here since PROM interfaces are 591 * used to start other CPUs above. 592 */ 593 release_bootstrap(); 594 #endif 595 596 /* 597 * Finish lgrp initialization after all CPUS are brought online. 598 */ 599 lgrp_init(LGRP_INIT_STAGE5); 600 601 /* 602 * After mp_init(), number of cpus are known (this is 603 * true for the time being, when there are actually 604 * hot pluggable cpus then this scheme would not do). 605 * Any per cpu initialization is done here. 606 */ 607 kmem_mp_init(); 608 vmem_update(NULL); 609 610 clock_tick_init_post(); 611 612 for (initptr = &mp_init_tbl[0]; *initptr; initptr++) 613 (**initptr)(); 614 615 /* 616 * These must be called after start_other_cpus 617 */ 618 pm_cfb_setup_intr(); 619 #if defined(__x86) 620 fastboot_post_startup(); 621 622 smt_late_init(); 623 #endif 624 625 /* 626 * Make init process; enter scheduling loop with system process. 627 * 628 * Note that we manually assign the pids for these processes, for 629 * historical reasons. If more pre-assigned pids are needed, 630 * FAMOUS_PIDS will have to be updated. 631 */ 632 633 /* create init process */ 634 if (newproc(start_init, NULL, defaultcid, 59, NULL, 635 FAMOUS_PID_INIT)) 636 panic("main: unable to fork init."); 637 638 /* create pageout daemon */ 639 if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL, 640 FAMOUS_PID_PAGEOUT)) 641 panic("main: unable to fork pageout()"); 642 643 /* create fsflush daemon */ 644 if (newproc(fsflush, NULL, syscid, minclsyspri, NULL, 645 FAMOUS_PID_FSFLUSH)) 646 panic("main: unable to fork fsflush()"); 647 648 /* create cluster process if we're a member of one */ 649 if (cluster_bootflags & CLUSTER_BOOTED) { 650 if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, 651 NULL, 0)) { 652 panic("main: unable to fork cluster()"); 653 } 654 } 655 656 /* 657 * Create system threads (threads are associated with p0) 658 */ 659 660 /* create module uninstall daemon */ 661 /* BugID 1132273. If swapping over NFS need a bigger stack */ 662 (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon, 663 NULL, 0, &p0, TS_RUN, minclsyspri); 664 665 (void) thread_create(NULL, 0, seg_pasync_thread, 666 NULL, 0, &p0, TS_RUN, minclsyspri); 667 668 pid_setmin(); 669 670 /* system is now ready */ 671 mutex_exit(&ualock); 672 673 bcopy("sched", PTOU(curproc)->u_psargs, 6); 674 bcopy("sched", PTOU(curproc)->u_comm, 5); 675 sched(); 676 /* NOTREACHED */ 677 } 678