/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1988 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/pcb.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/priocntl.h>
#include <sys/procset.h>
#include <sys/disp.h>
#include <sys/callo.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/conf.h>
#include <sys/bootconf.h>
#include <sys/utsname.h>
#include <sys/cmn_err.h>
#include <sys/vmparam.h>
#include <sys/modctl.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/ddi_timer.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/cpuvar.h>
#include <sys/cladm.h>
#include <sys/corectl.h>
#include <sys/exec.h>
#include <sys/syscall.h>
#include <sys/reboot.h>
#include <sys/task.h>
#include <sys/exacct.h>
#include <sys/autoconf.h>
#include <sys/errorq.h>
#include <sys/class.h>
#include <sys/stack.h>
#include <sys/brand.h>
#include <sys/mmapobj.h>

#include <vm/as.h>
#include <vm/seg_kmem.h>
#include <sys/dc_ki.h>

#include <c2/audit.h>
#include <sys/bootprops.h>

/* well known processes */
proc_t *proc_sched;		/* memory scheduler */
proc_t *proc_init;		/* init */
proc_t *proc_pageout;		/* pageout daemon */
proc_t *proc_fsflush;		/* fsflush daemon */

pgcnt_t	maxmem;		/* Maximum available memory in pages.	*/
pgcnt_t	freemem;	/* Current available memory in pages.	*/
int	audit_active;
int	interrupts_unleashed;	/* set when we do the first spl0() */

kmem_cache_t *process_cache;	/* kmem cache for proc structures */

/*
 * Process 0's lwp directory and lwpid hash table.
 * Both are statically sized at two entries; main() wires them into p0.
 */
lwpdir_t p0_lwpdir[2];
lwpdir_t *p0_tidhash[2];
lwpent_t p0_lep;

/*
 *	Machine-independent initialization code
 *	Called from cold start routine as
 *	soon as a stack and segmentation
 *	have been established.
 *	Functions:
 *		clear and free user core
 *		turn on clock
 *		hand craft 0th process
 *		call all initialization routines
 *		fork	- process 0 to schedule
 *			- process 1 execute bootstrap
 *			- process 2 to page out
 *		create system threads
 *
 *	NOTE: this header describes main(), which is defined further down
 *	in this file, not the declarations that immediately follow.
 */

/*
 * Cluster boot flags.  main() tests this against CLUSTER_BOOTED to decide
 * whether the cluster process should be created.
 */
int cluster_bootflags = 0;

/*
 * Entry point handed to newproc() by main() for the cluster process.
 * It simply runs cluster(); that call is not expected to come back, so
 * a return is treated as fatal.
 */
void
cluster_wrapper(void)
{
	cluster();
	panic("cluster() returned");
}

char initname[INITNAME_SZ] = "/sbin/init";	/* also referenced by zone0 */
char initargs[BOOTARGS_MAX] = "";		/* also referenced by zone0 */
extern int64_t lwp_sigmask(int, uint_t, uint_t);

/*
 * Construct a stack for init containing the arguments to it, then
 * pass control to exec_common.
133 */ 134 int 135 exec_init(const char *initpath, const char *args) 136 { 137 caddr32_t ucp; 138 caddr32_t *uap; 139 caddr32_t *argv; 140 caddr32_t exec_fnamep; 141 char *scratchargs; 142 int i, sarg; 143 size_t argvlen, alen; 144 boolean_t in_arg; 145 int argc = 0; 146 int error = 0, count = 0; 147 proc_t *p = ttoproc(curthread); 148 klwp_t *lwp = ttolwp(curthread); 149 int brand_action; 150 151 if (args == NULL) 152 args = ""; 153 154 alen = strlen(initpath) + 1 + strlen(args) + 1; 155 scratchargs = kmem_alloc(alen, KM_SLEEP); 156 (void) snprintf(scratchargs, alen, "%s %s", initpath, args); 157 158 /* 159 * We do a quick two state parse of the string to sort out how big 160 * argc should be. 161 */ 162 in_arg = B_FALSE; 163 for (i = 0; i < strlen(scratchargs); i++) { 164 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') { 165 if (in_arg) { 166 in_arg = B_FALSE; 167 argc++; 168 } 169 } else { 170 in_arg = B_TRUE; 171 } 172 } 173 argvlen = sizeof (caddr32_t) * (argc + 1); 174 argv = kmem_zalloc(argvlen, KM_SLEEP); 175 176 /* 177 * We pull off a bit of a hack here. We work our way through the 178 * args string, putting nulls at the ends of space delimited tokens 179 * (boot args don't support quoting at this time). Then we just 180 * copy the whole mess to userland in one go. In other words, we 181 * transform this: "init -s -r\0" into this on the stack: 182 * 183 * -0x00 \0 184 * -0x01 r 185 * -0x02 - <--------. 186 * -0x03 \0 | 187 * -0x04 s | 188 * -0x05 - <------. | 189 * -0x06 \0 | | 190 * -0x07 t | | 191 * -0x08 i | | 192 * -0x09 n | | 193 * -0x0a i <---. 
| | 194 * -0x10 NULL | | | (argv[3]) 195 * -0x14 -----|--|-' (argv[2]) 196 * -0x18 ------|--' (argv[1]) 197 * -0x1c -------' (argv[0]) 198 * 199 * Since we know the value of ucp at the beginning of this process, 200 * we can trivially compute the argv[] array which we also need to 201 * place in userland: argv[i] = ucp - sarg(i), where ucp is the 202 * stack ptr, and sarg is the string index of the start of the 203 * argument. 204 */ 205 ucp = (caddr32_t)(uintptr_t)p->p_usrstack; 206 207 argc = 0; 208 in_arg = B_FALSE; 209 sarg = 0; 210 211 for (i = 0; i < alen; i++) { 212 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') { 213 if (in_arg == B_TRUE) { 214 in_arg = B_FALSE; 215 scratchargs[i] = '\0'; 216 argv[argc++] = ucp - (alen - sarg); 217 } 218 } else if (in_arg == B_FALSE) { 219 in_arg = B_TRUE; 220 sarg = i; 221 } 222 } 223 ucp -= alen; 224 error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen); 225 226 uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t)); 227 uap--; /* advance to be below the word we're in */ 228 uap -= (argc + 1); /* advance argc words down, plus one for NULL */ 229 error |= copyout(argv, uap, argvlen); 230 231 if (error != 0) { 232 zcmn_err(p->p_zone->zone_id, CE_WARN, 233 "Could not construct stack for init.\n"); 234 kmem_free(argv, argvlen); 235 kmem_free(scratchargs, alen); 236 return (EFAULT); 237 } 238 239 exec_fnamep = argv[0]; 240 kmem_free(argv, argvlen); 241 kmem_free(scratchargs, alen); 242 243 /* 244 * Point at the arguments. 245 */ 246 lwp->lwp_ap = lwp->lwp_arg; 247 lwp->lwp_arg[0] = (uintptr_t)exec_fnamep; 248 lwp->lwp_arg[1] = (uintptr_t)uap; 249 lwp->lwp_arg[2] = NULL; 250 curthread->t_post_sys = 1; 251 curthread->t_sysnum = SYS_execve; 252 253 /* 254 * If we are executing init from zsched, we may have inherited its 255 * parent process's signal mask. Clear it now so that we behave in 256 * the same way as when started from the global zone. 
257 */ 258 (void) lwp_sigmask(SIG_UNBLOCK, 0xffffffff, 0xffffffff); 259 260 brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; 261 again: 262 error = exec_common((const char *)(uintptr_t)exec_fnamep, 263 (const char **)(uintptr_t)uap, NULL, brand_action); 264 265 /* 266 * Normally we would just set lwp_argsaved and t_post_sys and 267 * let post_syscall reset lwp_ap for us. Unfortunately, 268 * exec_init isn't always called from a system call. Instead 269 * of making a mess of trap_cleanup, we just reset the args 270 * pointer here. 271 */ 272 reset_syscall_args(); 273 274 switch (error) { 275 case 0: 276 return (0); 277 278 case ENOENT: 279 zcmn_err(p->p_zone->zone_id, CE_WARN, 280 "exec(%s) failed (file not found).\n", initpath); 281 return (ENOENT); 282 283 case EAGAIN: 284 case EINTR: 285 ++count; 286 if (count < 5) { 287 zcmn_err(p->p_zone->zone_id, CE_WARN, 288 "exec(%s) failed with errno %d. Retrying...\n", 289 initpath, error); 290 goto again; 291 } 292 } 293 294 zcmn_err(p->p_zone->zone_id, CE_WARN, 295 "exec(%s) failed with errno %d.", initpath, error); 296 return (error); 297 } 298 299 /* 300 * This routine does all of the common setup for invoking init; global 301 * and non-global zones employ this routine for the functionality which is 302 * in common. 303 * 304 * This program (init, presumably) must be a 32-bit process. 
 *
 * The calling process is set up with the 32-bit user stack address and
 * user limit, the ILP32 data model, non-executable stack and data
 * protections and a freshly allocated address space.  The zone's init
 * program is then exec'ed with the zone's boot arguments.
 *
 * Returns whatever exec_init() returns.
 */
int
start_init_common()
{
	proc_t *p = curproc;
	ASSERT_STACK_ALIGNED();
	/* Record this process as the init process of its zone. */
	p->p_zone->zone_proc_initpid = p->p_pid;

	/* Reset the process's own and child user/system time counters. */
	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
	p->p_usrstack = (caddr_t)USRSTACK32;
	p->p_model = DATAMODEL_ILP32;
	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_stk_ctl = INT32_MAX;

	/* Give the process a new address space bounded for a 32-bit program. */
	p->p_as = as_alloc();
	p->p_as->a_proc = p;
	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(p->p_as->a_hat, HAT_INIT);

	init_core();

	init_mstate(curthread, LMS_SYSTEM);
	return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs));
}

/*
 * Start the initial user process for the global zone; once running, if
 * init should subsequently fail, it will automatically be caught in the
 * exit(2) path, and restarted by restart_init().
 *
 * This is the function main() hands to newproc() for the init process.
 * Failure to start init halts the system.
 */
static void
start_init(void)
{
	proc_init = curproc;

	ASSERT(curproc->p_zone->zone_initname != NULL);

	if (start_init_common() != 0)
		halt("unix: Could not start init");
	/* NOTE(review): presumably returns to user mode in init -- confirm. */
	lwp_rtt();
}

#if defined(__i386) || defined(__amd64)
extern void return_instr(void);
/*
 * Hook invoked by main() once interrupts are enabled, to add the Intel
 * IOMMU fault event handler.  It is initialized to return_instr.
 * NOTE(review): return_instr looks like a do-nothing default that other
 * code is expected to replace -- confirm.
 */
void (*rootnex_iommu_add_intr)(void) = (void (*)(void))return_instr;
#endif

/*
 * Machine-independent kernel initialization, run on the current thread
 * as process 0 (p is &p0).  The sequence below is strictly ordered; see
 * the comments on the individual steps.  After creating the init,
 * pageout, fsflush and (optionally) cluster processes and the system
 * threads, it calls sched() and does not return.
 */
void
main(void)
{
	proc_t		*p = ttoproc(curthread);	/* &p0 */
	int		(**initptr)();
	extern void	sched();
	extern void	fsflush();
	extern int	(*init_tbl[])();
	extern int	(*mp_init_tbl[])();
	extern id_t	syscid, defaultcid;
	extern int	swaploaded;
	extern int	netboot;
	extern ib_boot_prop_t *iscsiboot_prop;
	extern void	vm_init(void);
	extern void	cbe_init_pre(void);
	extern void	cbe_init(void);
	extern void	clock_tick_init_pre(void);
	extern void	clock_tick_init_post(void);
	extern void	clock_init(void);
	extern void	physio_bufs_init(void);
	extern void	pm_cfb_setup_intr(void);
	extern int	pm_adjust_timestamps(dev_info_t *, void *);
	extern void	start_other_cpus(int);
	extern void	sysevent_evc_thrinit();
	extern void	lgrp_main_init(void);
	extern void	lgrp_main_mp_init(void);
#if defined(__x86)
	extern void	cpupm_post_startup(void);
#endif
	/*
	 * In the horrible world of x86 in-lines, you can't get symbolic
	 * structure offsets a la genassym.  This assertion is here so
	 * that the next poor slob who innocently changes the offset of
	 * cpu_thread doesn't waste as much time as I just did finding
	 * out that it's hard-coded in i86/ml/i86.il.  Similarly for
	 * curcpup.  You're welcome.
	 */
	ASSERT(CPU == CPU->cpu_self);
	ASSERT(curthread == CPU->cpu_thread);
	ASSERT_STACK_ALIGNED();

	/*
	 * Setup the first lgroup, and home t0
	 */
	lgrp_setup();

	/*
	 * Once 'startup()' completes, the thread_reaper() daemon would be
	 * created(in thread_init()). After that, it is safe to create threads
	 * that could exit. These exited threads will get reaped.
	 */
	startup();
	segkmem_gc();
	callb_init();
	cbe_init_pre();	/* x86 must initialize gethrtimef before timer_init */
	timer_init();	/* timer must be initialized before cyclic starts */
	cbe_init();
	callout_init();	/* callout table MUST be init'd after cyclics */
	clock_tick_init_pre();
	clock_init();

	/*
	 * On some platforms, clkinitf() changes the timing source that
	 * gethrtime_unscaled() uses to generate timestamps.  cbe_init() calls
	 * clkinitf(), so re-initialize the microstate counters after the
	 * timesource has been chosen.
	 */
	init_mstate(&t0, LMS_SYSTEM);
	init_cpu_mstate(CPU, CMS_SYSTEM);

	/*
	 * May need to probe to determine latencies from CPU 0 after
	 * gethrtime() comes alive in cbe_init() and before enabling interrupts
	 */
	lgrp_plat_probe();

	/*
	 * Call all system initialization functions.
	 * init_tbl[] is walked until its terminating null entry.
	 */
	for (initptr = &init_tbl[0]; *initptr; initptr++)
		(**initptr)();
	/*
	 * Load iSCSI boot properties
	 */
	ld_ib_prop();
	/*
	 * initialize vm related stuff.
	 */
	vm_init();

	/*
	 * initialize buffer pool for raw I/O requests
	 */
	physio_bufs_init();

	ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */

	/*
	 * Drop the interrupt level and allow interrupts.  At this point
	 * the DDI guarantees that interrupts are enabled.
	 */
	(void) spl0();
	interrupts_unleashed = 1;

#if defined(__i386) || defined(__amd64)
	/*
	 * add intel iommu fault event handler
	 */
	rootnex_iommu_add_intr();
#endif

	vfs_mountroot();	/* Mount the root file system */
	errorq_init();		/* after vfs_mountroot() so DDI root is ready */
	cpu_kstat_init(CPU);	/* after vfs_mountroot() so TOD is valid */
	ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
				/* after vfs_mountroot() so hrestime is valid */

	post_startup();
	swaploaded = 1;

	/*
	 * Initialize Solaris Audit Subsystem
	 */
	audit_init();

	/*
	 * Plumb the protocol modules and drivers only if we are not
	 * networked booted, in this case we already did it in rootconf().
	 */
	if (netboot == 0 && iscsiboot_prop == NULL)
		(void) strplumb();

	/* Stamp the start times of process 0 and of the current thread. */
	gethrestime(&PTOU(curproc)->u_start);
	curthread->t_start = PTOU(curproc)->u_start.tv_sec;
	p->p_mstart = gethrtime();

	/*
	 * Perform setup functions that can only be done after root
	 * and swap have been set up.
	 */
	consconfig();
	release_bootstrap();

	/*
	 * attach drivers with ddi-forceattach prop
	 * This must be done after consconfig() to prevent usb key/mouse
	 * from attaching before the upper console stream is plumbed.
	 * It must be done early enough to load hotplug drivers (e.g.
	 * pcmcia nexus) so that devices enumerated via hotplug are
	 * available before I/O subsystem is fully initialized.
	 */
	i_ddi_forceattach_drivers();

	/*
	 * Set the scan rate and other parameters of the paging subsystem.
	 */
	setupclock(0);

	/*
	 * Create kmem cache for proc structures
	 */
	process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Initialize process 0's lwp directory and lwpid hash table.
	 * Both use the static two-entry arrays p0_lwpdir[] and p0_tidhash[];
	 * the current thread is then entered through p0_lep.
	 */
	p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
	p->p_lwpdir->ld_next = p->p_lwpdir + 1;
	p->p_lwpdir_sz = 2;
	p->p_tidhash = p0_tidhash;
	p->p_tidhash_sz = 2;
	p0_lep.le_thread = curthread;
	p0_lep.le_lwpid = curthread->t_tid;
	p0_lep.le_start = curthread->t_start;
	lwp_hash_in(p, &p0_lep);

	/*
	 * Initialize extended accounting.
	 */
	exacct_init();

	/*
	 * Initialize threads of sysevent event channels
	 */
	sysevent_evc_thrinit();

	/*
	 * main lgroup initialization
	 * This must be done after post_startup(), but before
	 * start_other_cpus()
	 */
	lgrp_main_init();

	/*
	 * Perform MP initialization, if any.
	 */
	start_other_cpus(0);

	/*
	 * Finish lgrp initialization after all CPUS are brought online.
	 */
	lgrp_main_mp_init();

	/*
	 * Initialize lib_va arenas.  Needs to be done after start_other_cpus
	 * so that USERLIMIT32 represents the final value after any
	 * workarounds have been applied. Also need to be done before we
	 * create any processes so that all libs can be cached.
	 */
	lib_va_init();

	/*
	 * After mp_init(), number of cpus are known (this is
	 * true for the time being, when there are actually
	 * hot pluggable cpus then this scheme  would not do).
	 * Any per cpu initialization is done here.
	 */
	kmem_mp_init();
	vmem_update(NULL);

	clock_tick_init_post();

	/* Run the MP init table, again up to its terminating null entry. */
	for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
		(**initptr)();

	/*
	 * These must be called after start_other_cpus
	 */
	pm_cfb_setup_intr();
#if defined(__x86)
	cpupm_post_startup();
#endif

	/*
	 * Make init process; enter scheduling loop with system process.
	 */

	/* create init process */
	/* NOTE(review): 59 is a hard-coded priority; its meaning is not shown here. */
	if (newproc(start_init, NULL, defaultcid, 59, NULL))
		panic("main: unable to fork init.");

	/* create pageout daemon */
	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL))
		panic("main: unable to fork pageout()");

	/* create fsflush daemon */
	if (newproc(fsflush, NULL, syscid, minclsyspri, NULL))
		panic("main: unable to fork fsflush()");

	/* create cluster process if we're a member of one */
	if (cluster_bootflags & CLUSTER_BOOTED) {
		if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, NULL))
			panic("main: unable to fork cluster()");
	}

	/*
	 * Create system threads (threads are associated with p0)
	 */

	/* create module uninstall daemon */
	/* BugID 1132273. If swapping over NFS need a bigger stack */
	(void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	(void) thread_create(NULL, 0, seg_pasync_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	pid_setmin();

	/*
	 * Name process 0 "sched".  The first copy (6 bytes) includes the
	 * terminating NUL; the second copies only the 5 characters.
	 * NOTE(review): u_comm presumably is already zero-filled so that it
	 * ends up NUL-terminated -- confirm.
	 */
	bcopy("sched", PTOU(curproc)->u_psargs, 6);
	bcopy("sched", PTOU(curproc)->u_comm, 5);
	sched();
	/* NOTREACHED */
}