/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * method.c - method execution functions
 *
 * This file contains the routines needed to run a method: a fork(2)-exec(2)
 * invocation monitored using either the contract filesystem or waitpid(2).
 * (Plain fork1(2) support is provided in fork.c.)
 *
 * Contract Transfer
 * When we restart a service, we want to transfer any contracts that the old
 * service's contract inherited. This means that (a) we must not abandon the
 * old contract when the service dies and (b) we must write the id of the old
 * contract into the terms of the new contract. There should be limits to
 * (a), though, since we don't want to keep the contract around forever. To
 * this end we'll say that services in the offline state may have a contract
 * to be transferred and services in the disabled or maintenance states cannot.
44 * This means that when a service transitions from online (or degraded) to 45 * offline, the contract should be preserved, and when the service transitions 46 * from offline to online (i.e., the start method), we'll transfer inherited 47 * contracts. 48 */ 49 50 #include <sys/contract/process.h> 51 #include <sys/ctfs.h> 52 #include <sys/stat.h> 53 #include <sys/time.h> 54 #include <sys/types.h> 55 #include <sys/uio.h> 56 #include <sys/wait.h> 57 #include <alloca.h> 58 #include <assert.h> 59 #include <errno.h> 60 #include <fcntl.h> 61 #include <libcontract.h> 62 #include <libcontract_priv.h> 63 #include <libgen.h> 64 #include <librestart.h> 65 #include <libscf.h> 66 #include <limits.h> 67 #include <port.h> 68 #include <sac.h> 69 #include <signal.h> 70 #include <stdlib.h> 71 #include <string.h> 72 #include <strings.h> 73 #include <unistd.h> 74 75 #include "startd.h" 76 77 #define SBIN_SH "/sbin/sh" 78 79 /* 80 * Mapping from restart_on method-type to contract events. Must correspond to 81 * enum method_restart_t. 82 */ 83 static uint_t method_events[] = { 84 /* METHOD_RESTART_ALL */ 85 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE | CT_PR_EV_EMPTY, 86 /* METHOD_RESTART_EXTERNAL_FAULT */ 87 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL, 88 /* METHOD_RESTART_ANY_FAULT */ 89 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE 90 }; 91 92 /* 93 * method_record_start(restarter_inst_t *) 94 * Record a service start for rate limiting. Place the current time 95 * in the circular array of instance starts. 96 */ 97 static void 98 method_record_start(restarter_inst_t *inst) 99 { 100 int index = inst->ri_start_index++ % RINST_START_TIMES; 101 102 inst->ri_start_time[index] = gethrtime(); 103 } 104 105 /* 106 * method_rate_critical(restarter_inst_t *) 107 * Return true if the average start interval is less than the permitted 108 * interval. Implicit success if insufficient measurements for an 109 * average exist. 
110 */ 111 static int 112 method_rate_critical(restarter_inst_t *inst) 113 { 114 uint_t n = inst->ri_start_index; 115 hrtime_t avg_ns = 0; 116 117 if (inst->ri_start_index < RINST_START_TIMES) 118 return (0); 119 120 avg_ns = 121 (inst->ri_start_time[(n - 1) % RINST_START_TIMES] - 122 inst->ri_start_time[n % RINST_START_TIMES]) / 123 (RINST_START_TIMES - 1); 124 125 return (avg_ns < RINST_FAILURE_RATE_NS); 126 } 127 128 /* 129 * int method_is_transient() 130 * Determine if the method for the given instance is transient, 131 * from a contract perspective. Return 1 if it is, and 0 if it isn't. 132 */ 133 static int 134 method_is_transient(restarter_inst_t *inst, int type) 135 { 136 if (instance_is_transient_style(inst) || type != METHOD_START) 137 return (1); 138 else 139 return (0); 140 } 141 142 /* 143 * void method_store_contract() 144 * Store the newly created contract id into local structures and 145 * the repository. If the repository connection is broken it is rebound. 146 */ 147 static void 148 method_store_contract(restarter_inst_t *inst, int type, ctid_t *cid) 149 { 150 int r; 151 boolean_t primary; 152 153 if (errno = contract_latest(cid)) 154 uu_die("%s: Couldn't get new contract's id", inst->ri_i.i_fmri); 155 156 primary = !method_is_transient(inst, type); 157 158 if (!primary) { 159 if (inst->ri_i.i_transient_ctid != 0) { 160 log_framework(LOG_INFO, 161 "%s: transient ctid expected to be 0 but " 162 "was set to %ld\n", inst->ri_i.i_fmri, 163 inst->ri_i.i_transient_ctid); 164 } 165 166 inst->ri_i.i_transient_ctid = *cid; 167 } else { 168 if (inst->ri_i.i_primary_ctid != 0) { 169 /* 170 * There was an old contract that we transferred. 171 * Remove it. 
172 */ 173 method_remove_contract(inst, B_TRUE, B_FALSE); 174 } 175 176 if (inst->ri_i.i_primary_ctid != 0) { 177 log_framework(LOG_INFO, 178 "%s: primary ctid expected to be 0 but " 179 "was set to %ld\n", inst->ri_i.i_fmri, 180 inst->ri_i.i_primary_ctid); 181 } 182 183 inst->ri_i.i_primary_ctid = *cid; 184 inst->ri_i.i_primary_ctid_stopped = 0; 185 186 contract_hash_store(*cid, inst->ri_id); 187 } 188 189 again: 190 if (inst->ri_mi_deleted) 191 return; 192 193 r = restarter_store_contract(inst->ri_m_inst, *cid, primary ? 194 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT); 195 switch (r) { 196 case 0: 197 break; 198 199 case ECANCELED: 200 inst->ri_mi_deleted = B_TRUE; 201 break; 202 203 case ECONNABORTED: 204 libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst)); 205 /* FALLTHROUGH */ 206 207 case EBADF: 208 libscf_reget_instance(inst); 209 goto again; 210 211 case ENOMEM: 212 case EPERM: 213 case EACCES: 214 case EROFS: 215 uu_die("%s: Couldn't store contract id %ld", 216 inst->ri_i.i_fmri, *cid); 217 /* NOTREACHED */ 218 219 case EINVAL: 220 default: 221 bad_error("restarter_store_contract", r); 222 } 223 } 224 225 /* 226 * void method_remove_contract() 227 * Remove any non-permanent contracts from internal structures and 228 * the repository, then abandon them. 229 * Returns 230 * 0 - success 231 * ECANCELED - inst was deleted from the repository 232 * 233 * If the repository connection was broken, it is rebound. 234 */ 235 void 236 method_remove_contract(restarter_inst_t *inst, boolean_t primary, 237 boolean_t abandon) 238 { 239 ctid_t * const ctidp = primary ? &inst->ri_i.i_primary_ctid : 240 &inst->ri_i.i_transient_ctid; 241 242 int r; 243 244 assert(*ctidp != 0); 245 246 log_framework(LOG_DEBUG, "Removing %s contract %lu for %s.\n", 247 primary ? 
"primary" : "transient", *ctidp, inst->ri_i.i_fmri); 248 249 if (abandon) 250 contract_abandon(*ctidp); 251 252 again: 253 if (inst->ri_mi_deleted) { 254 r = ECANCELED; 255 goto out; 256 } 257 258 r = restarter_remove_contract(inst->ri_m_inst, *ctidp, primary ? 259 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT); 260 switch (r) { 261 case 0: 262 break; 263 264 case ECANCELED: 265 inst->ri_mi_deleted = B_TRUE; 266 break; 267 268 case ECONNABORTED: 269 libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst)); 270 /* FALLTHROUGH */ 271 272 case EBADF: 273 libscf_reget_instance(inst); 274 goto again; 275 276 case ENOMEM: 277 case EPERM: 278 case EACCES: 279 case EROFS: 280 log_error(LOG_INFO, "%s: Couldn't remove contract id %ld: " 281 "%s.\n", inst->ri_i.i_fmri, *ctidp, strerror(r)); 282 break; 283 284 case EINVAL: 285 default: 286 bad_error("restarter_remove_contract", r); 287 } 288 289 out: 290 if (primary) 291 contract_hash_remove(*ctidp); 292 293 *ctidp = 0; 294 } 295 296 /* 297 * int method_ready_contract(restarter_inst_t *, int, method_restart_t, int) 298 * 299 * Activate a contract template for the type method of inst. type, 300 * restart_on, and cte_mask dictate the critical events term of the contract. 301 * Returns 302 * 0 - success 303 * ECANCELED - inst has been deleted from the repository 304 */ 305 static int 306 method_ready_contract(restarter_inst_t *inst, int type, 307 method_restart_t restart_on, uint_t cte_mask) 308 { 309 int tmpl, err, istrans, iswait, ret; 310 uint_t cevents, fevents; 311 312 /* 313 * Correctly supporting wait-style services is tricky without 314 * rearchitecting startd to cope with multiple event sources 315 * simultaneously trying to stop an instance. Until a better 316 * solution is implemented, we avoid this problem for 317 * wait-style services by making contract events fatal and 318 * letting the wait code alone handle stopping the service. 
319 */ 320 iswait = instance_is_wait_style(inst); 321 istrans = method_is_transient(inst, type); 322 323 tmpl = open64(CTFS_ROOT "/process/template", O_RDWR); 324 if (tmpl == -1) 325 uu_die("Could not create contract template"); 326 327 /* 328 * We assume non-login processes are unlikely to create 329 * multiple process groups, and set CT_PR_PGRPONLY for all 330 * wait-style services' contracts. 331 */ 332 err = ct_pr_tmpl_set_param(tmpl, CT_PR_INHERIT | CT_PR_REGENT | 333 (iswait ? CT_PR_PGRPONLY : 0)); 334 assert(err == 0); 335 336 if (istrans) { 337 cevents = 0; 338 fevents = 0; 339 } else { 340 assert(restart_on >= 0); 341 assert(restart_on <= METHOD_RESTART_ANY_FAULT); 342 cevents = method_events[restart_on] & ~cte_mask; 343 fevents = iswait ? 344 (method_events[restart_on] & ~cte_mask & CT_PR_ALLFATAL) : 345 0; 346 } 347 348 err = ct_tmpl_set_critical(tmpl, cevents); 349 assert(err == 0); 350 351 err = ct_tmpl_set_informative(tmpl, 0); 352 assert(err == 0); 353 err = ct_pr_tmpl_set_fatal(tmpl, fevents); 354 assert(err == 0); 355 356 err = ct_tmpl_set_cookie(tmpl, istrans ? METHOD_OTHER_COOKIE : 357 METHOD_START_COOKIE); 358 assert(err == 0); 359 360 if (type == METHOD_START && inst->ri_i.i_primary_ctid != 0) { 361 ret = ct_pr_tmpl_set_transfer(tmpl, inst->ri_i.i_primary_ctid); 362 switch (ret) { 363 case 0: 364 break; 365 366 case ENOTEMPTY: 367 /* No contracts for you! 
*/ 368 method_remove_contract(inst, B_TRUE, B_TRUE); 369 if (inst->ri_mi_deleted) { 370 ret = ECANCELED; 371 goto out; 372 } 373 break; 374 375 case EINVAL: 376 case ESRCH: 377 case EACCES: 378 default: 379 bad_error("ct_pr_tmpl_set_transfer", ret); 380 } 381 } 382 383 err = ct_tmpl_activate(tmpl); 384 assert(err == 0); 385 386 ret = 0; 387 388 out: 389 err = close(tmpl); 390 assert(err == 0); 391 392 return (ret); 393 } 394 395 static const char *method_names[] = { "start", "stop", "refresh" }; 396 397 static void 398 exec_method(const restarter_inst_t *inst, int type, const char *method, 399 struct method_context *mcp, uint8_t need_session) 400 { 401 char *cmd; 402 const char *errf; 403 char **nenv; 404 405 cmd = uu_msprintf("exec %s", method); 406 407 if (inst->ri_utmpx_prefix[0] != '\0' && inst->ri_utmpx_prefix != NULL) 408 (void) utmpx_mark_init(getpid(), inst->ri_utmpx_prefix); 409 410 setlog(inst->ri_logstem); 411 log_instance(inst, B_FALSE, "Executing %s method (\"%s\")", 412 method_names[type], method); 413 414 if (need_session) 415 (void) setpgrp(); 416 417 /* Set credentials. 
*/ 418 errno = restarter_set_method_context(mcp, &errf); 419 if (errno != 0) { 420 (void) fputs("svc.startd could not set context for method: ", 421 stderr); 422 423 if (errno == -1) { 424 if (strcmp(errf, "core_set_process_path") == 0) { 425 (void) fputs("Could not set corefile path.\n", 426 stderr); 427 } else if (strcmp(errf, "setproject") == 0) { 428 (void) fprintf(stderr, "%s: a resource control " 429 "assignment failed\n", errf); 430 } else if (strcmp(errf, "pool_set_binding") == 0) { 431 (void) fprintf(stderr, "%s: a system error " 432 "occurred\n", errf); 433 } else { 434 #ifndef NDEBUG 435 uu_warn("%s:%d: Bad function name \"%s\" for " 436 "error %d from " 437 "restarter_set_method_context().\n", 438 __FILE__, __LINE__, errf, errno); 439 #endif 440 abort(); 441 } 442 443 exit(1); 444 } 445 446 if (errf != NULL && strcmp(errf, "pool_set_binding") == 0) { 447 switch (errno) { 448 case ENOENT: 449 (void) fprintf(stderr, "%s: the pool could not " 450 "be found\n", errf); 451 break; 452 453 case EBADF: 454 (void) fprintf(stderr, "%s: the configuration " 455 "is invalid\n", errf); 456 break; 457 458 default: 459 #ifndef NDEBUG 460 uu_warn("%s:%d: Bad error %d for function %s " 461 "in restarter_set_method_context().\n", 462 __FILE__, __LINE__, errno, errf); 463 #endif 464 abort(); 465 } 466 467 exit(SMF_EXIT_ERR_CONFIG); 468 } 469 470 if (errf != NULL) { 471 perror(errf); 472 473 switch (errno) { 474 case EINVAL: 475 case EPERM: 476 case ENOENT: 477 case ENAMETOOLONG: 478 case ERANGE: 479 case ESRCH: 480 exit(SMF_EXIT_ERR_CONFIG); 481 /* NOTREACHED */ 482 483 default: 484 exit(1); 485 } 486 } 487 488 switch (errno) { 489 case ENOMEM: 490 (void) fputs("Out of memory.\n", stderr); 491 exit(1); 492 /* NOTREACHED */ 493 494 case ENOENT: 495 (void) fputs("Missing passwd entry for user.\n", 496 stderr); 497 exit(SMF_EXIT_ERR_CONFIG); 498 /* NOTREACHED */ 499 500 default: 501 #ifndef NDEBUG 502 uu_warn("%s:%d: Bad miscellaneous error %d from " 503 
"restarter_set_method_context().\n", __FILE__, 504 __LINE__, errno); 505 #endif 506 abort(); 507 } 508 } 509 510 nenv = set_smf_env(mcp->env, mcp->env_sz, NULL, inst, method); 511 512 log_preexec(); 513 514 (void) execle(SBIN_SH, SBIN_SH, "-c", cmd, NULL, nenv); 515 516 exit(10); 517 } 518 519 static void 520 write_status(restarter_inst_t *inst, const char *mname, int stat) 521 { 522 int r; 523 524 again: 525 if (inst->ri_mi_deleted) 526 return; 527 528 r = libscf_write_method_status(inst->ri_m_inst, mname, stat); 529 switch (r) { 530 case 0: 531 break; 532 533 case ECONNABORTED: 534 libscf_reget_instance(inst); 535 goto again; 536 537 case ECANCELED: 538 inst->ri_mi_deleted = 1; 539 break; 540 541 case EPERM: 542 case EACCES: 543 case EROFS: 544 log_framework(LOG_INFO, "Could not write exit status " 545 "for %s method of %s: %s.\n", mname, 546 inst->ri_i.i_fmri, strerror(r)); 547 break; 548 549 case ENAMETOOLONG: 550 default: 551 bad_error("libscf_write_method_status", r); 552 } 553 } 554 555 /* 556 * int method_run() 557 * Execute the type method of instp. If it requires a fork(), wait for it 558 * to return and return its exit code in *exit_code. Otherwise set 559 * *exit_code to 0 if the method succeeds & -1 if it fails. If the 560 * repository connection is broken, it is rebound, but inst may not be 561 * reset. 562 * Returns 563 * 0 - success 564 * EINVAL - A correct method or method context couldn't be retrieved. 565 * EIO - Contract kill failed. 566 * EFAULT - Method couldn't be executed successfully. 567 * ELOOP - Retry threshold exceeded. 568 * ECANCELED - inst was deleted from the repository before method was run 569 * ERANGE - Timeout retry threshold exceeded. 570 * EAGAIN - Failed due to external cause, retry. 
571 */ 572 int 573 method_run(restarter_inst_t **instp, int type, int *exit_code) 574 { 575 char *method; 576 int ret_status; 577 pid_t pid; 578 method_restart_t restart_on; 579 uint_t cte_mask; 580 uint8_t need_session; 581 scf_handle_t *h; 582 scf_snapshot_t *snap; 583 const char *mname; 584 const char *errstr; 585 struct method_context *mcp; 586 int result = 0, timeout_fired = 0; 587 int sig, r; 588 boolean_t transient; 589 uint64_t timeout; 590 uint8_t timeout_retry; 591 ctid_t ctid; 592 int ctfd = -1; 593 ct_evthdl_t ctev; 594 uint_t evtype; 595 restarter_inst_t *inst = *instp; 596 int id = inst->ri_id; 597 598 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 599 assert(instance_in_transition(inst)); 600 601 if (inst->ri_mi_deleted) 602 return (ECANCELED); 603 604 *exit_code = 0; 605 606 assert(0 <= type && type <= 2); 607 mname = method_names[type]; 608 609 if (type == METHOD_START) 610 inst->ri_pre_online_hook(); 611 612 h = scf_instance_handle(inst->ri_m_inst); 613 614 snap = scf_snapshot_create(h); 615 if (snap == NULL || 616 scf_instance_get_snapshot(inst->ri_m_inst, "running", snap) != 0) { 617 log_framework(LOG_DEBUG, 618 "Could not get running snapshot for %s. " 619 "Using editing version to run method %s.\n", 620 inst->ri_i.i_fmri, mname); 621 scf_snapshot_destroy(snap); 622 snap = NULL; 623 } 624 625 /* 626 * After this point, we may be logging to the instance log. 627 * Make sure we've noted where that log is as a property of 628 * the instance. 
629 */ 630 r = libscf_note_method_log(inst->ri_m_inst, st->st_log_prefix, 631 inst->ri_logstem); 632 if (r != 0) { 633 log_framework(LOG_WARNING, 634 "%s: couldn't note log location: %s\n", 635 inst->ri_i.i_fmri, strerror(r)); 636 } 637 638 if ((method = libscf_get_method(h, type, inst, snap, &restart_on, 639 &cte_mask, &need_session, &timeout, &timeout_retry)) == NULL) { 640 if (errno == LIBSCF_PGROUP_ABSENT) { 641 log_framework(LOG_DEBUG, 642 "%s: instance has no method property group '%s'.\n", 643 inst->ri_i.i_fmri, mname); 644 if (type == METHOD_REFRESH) 645 log_instance(inst, B_TRUE, "No '%s' method " 646 "defined. Treating as :true.", mname); 647 else 648 log_instance(inst, B_TRUE, "Method property " 649 "group '%s' is not present.", mname); 650 scf_snapshot_destroy(snap); 651 return (0); 652 } else if (errno == LIBSCF_PROPERTY_ABSENT) { 653 log_framework(LOG_DEBUG, 654 "%s: instance has no '%s/exec' method property.\n", 655 inst->ri_i.i_fmri, mname); 656 log_instance(inst, B_TRUE, "Method property '%s/exec " 657 "is not present.", mname); 658 scf_snapshot_destroy(snap); 659 return (0); 660 } else { 661 log_error(LOG_WARNING, 662 "%s: instance libscf_get_method failed\n", 663 inst->ri_i.i_fmri); 664 scf_snapshot_destroy(snap); 665 return (EINVAL); 666 } 667 } 668 669 /* open service contract if stopping a non-transient service */ 670 if (type == METHOD_STOP && (!instance_is_transient_style(inst))) { 671 if (inst->ri_i.i_primary_ctid == 0) { 672 /* service is not running, nothing to stop */ 673 log_framework(LOG_DEBUG, "%s: instance has no primary " 674 "contract, no service to stop.\n", 675 inst->ri_i.i_fmri); 676 scf_snapshot_destroy(snap); 677 return (0); 678 } 679 if ((ctfd = contract_open(inst->ri_i.i_primary_ctid, "process", 680 "events", O_RDONLY)) < 0) { 681 result = EFAULT; 682 log_instance(inst, B_TRUE, "Could not open service " 683 "contract %ld. 
Stop method not run.\n", 684 inst->ri_i.i_primary_ctid); 685 goto out; 686 } 687 } 688 689 if (restarter_is_null_method(method)) { 690 log_framework(LOG_DEBUG, "%s: null method succeeds\n", 691 inst->ri_i.i_fmri); 692 693 log_instance(inst, B_TRUE, "Executing %s method (null)", mname); 694 695 if (type == METHOD_START) 696 write_status(inst, mname, 0); 697 goto out; 698 } 699 700 sig = restarter_is_kill_method(method); 701 if (sig >= 0) { 702 703 if (inst->ri_i.i_primary_ctid == 0) { 704 log_error(LOG_ERR, "%s: :kill with no contract\n", 705 inst->ri_i.i_fmri); 706 result = EINVAL; 707 goto out; 708 } 709 710 log_framework(LOG_DEBUG, 711 "%s: :killing contract with signal %d\n", 712 inst->ri_i.i_fmri, sig); 713 714 log_instance(inst, B_TRUE, "Executing %s method (:kill)", 715 mname); 716 717 if (contract_kill(inst->ri_i.i_primary_ctid, sig, 718 inst->ri_i.i_fmri) != 0) { 719 result = EIO; 720 goto out; 721 } else 722 goto assured_kill; 723 } 724 725 log_framework(LOG_DEBUG, "%s: forking to run method %s\n", 726 inst->ri_i.i_fmri, method); 727 728 errstr = restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION, 729 inst->ri_m_inst, snap, mname, method, &mcp); 730 731 if (errstr != NULL) { 732 log_error(LOG_WARNING, "%s: %s\n", inst->ri_i.i_fmri, errstr); 733 result = EINVAL; 734 goto out; 735 } 736 737 r = method_ready_contract(inst, type, restart_on, cte_mask); 738 if (r != 0) { 739 assert(r == ECANCELED); 740 assert(inst->ri_mi_deleted); 741 restarter_free_method_context(mcp); 742 result = ECANCELED; 743 goto out; 744 } 745 746 /* 747 * Validate safety of method contexts, to save children work. 748 */ 749 if (!restarter_rm_libs_loadable()) 750 log_framework(LOG_DEBUG, "%s: method contexts limited " 751 "to root-accessible libraries\n", inst->ri_i.i_fmri); 752 753 /* 754 * If the service is restarting too quickly, send it to 755 * maintenance. 
756 */ 757 if (type == METHOD_START) { 758 method_record_start(inst); 759 if (method_rate_critical(inst)) { 760 log_instance(inst, B_TRUE, "Restarting too quickly, " 761 "changing state to maintenance"); 762 result = ELOOP; 763 goto out; 764 } 765 } 766 767 pid = startd_fork1(NULL); 768 if (pid == 0) 769 exec_method(inst, type, method, mcp, need_session); 770 771 if (pid == -1) { 772 log_error(LOG_WARNING, 773 "%s: Couldn't fork to execute method %s\n", 774 inst->ri_i.i_fmri, method); 775 result = EFAULT; 776 goto out; 777 } 778 779 restarter_free_method_context(mcp); 780 781 /* 782 * Get the contract id, decide whether it is primary or transient, and 783 * stash it in inst & the repository. 784 */ 785 method_store_contract(inst, type, &ctid); 786 787 /* 788 * Similarly for the start method PID. 789 */ 790 if (type == METHOD_START && !inst->ri_mi_deleted) 791 (void) libscf_write_start_pid(inst->ri_m_inst, pid); 792 793 if (instance_is_wait_style(inst) && type == METHOD_START) { 794 /* Wait style instances don't get timeouts on start methods. */ 795 if (wait_register(pid, inst->ri_i.i_fmri, 1, 0)) { 796 log_error(LOG_WARNING, 797 "%s: couldn't register %ld for wait\n", 798 inst->ri_i.i_fmri, pid); 799 result = EFAULT; 800 goto contract_out; 801 } 802 write_status(inst, mname, 0); 803 804 } else { 805 int r, err; 806 time_t start_time; 807 time_t end_time; 808 809 /* 810 * Because on upgrade/live-upgrade we may have no chance 811 * to override faulty timeout values on the way to 812 * manifest import, all services on the path to manifest 813 * import are treated the same as INFINITE timeout services. 814 */ 815 816 start_time = time(NULL); 817 if (timeout != METHOD_TIMEOUT_INFINITE && !is_timeout_ovr(inst)) 818 timeout_insert(inst, ctid, timeout); 819 else 820 timeout = METHOD_TIMEOUT_INFINITE; 821 822 /* Unlock the instance while waiting for the method. 
*/ 823 MUTEX_UNLOCK(&inst->ri_lock); 824 825 do 826 r = waitpid(pid, &ret_status, NULL); 827 while (r == -1 && errno == EINTR); 828 if (r == -1) 829 err = errno; 830 831 /* Re-grab the lock. */ 832 inst = inst_lookup_by_id(id); 833 834 /* 835 * inst can't be removed, as the removal thread waits 836 * for completion of this one. 837 */ 838 assert(inst != NULL); 839 *instp = inst; 840 841 if (inst->ri_timeout != NULL && inst->ri_timeout->te_fired) 842 timeout_fired = 1; 843 844 timeout_remove(inst, ctid); 845 846 log_framework(LOG_DEBUG, 847 "%s method for %s exited with status %d.\n", mname, 848 inst->ri_i.i_fmri, WEXITSTATUS(ret_status)); 849 850 if (r == -1) { 851 log_error(LOG_WARNING, 852 "Couldn't waitpid() for %s method of %s (%s).\n", 853 mname, inst->ri_i.i_fmri, strerror(err)); 854 result = EFAULT; 855 goto contract_out; 856 } 857 858 if (type == METHOD_START) 859 write_status(inst, mname, ret_status); 860 861 /* return ERANGE if this service doesn't retry on timeout */ 862 if (timeout_fired == 1 && timeout_retry == 0) { 863 result = ERANGE; 864 goto contract_out; 865 } 866 867 if (!WIFEXITED(ret_status)) { 868 /* 869 * If method didn't exit itself (it was killed by an 870 * external entity, etc.), consider the entire 871 * method_run as failed. 
872 */ 873 if (WIFSIGNALED(ret_status)) { 874 char buf[SIG2STR_MAX]; 875 (void) sig2str(WTERMSIG(ret_status), buf); 876 877 log_error(LOG_WARNING, "%s: Method \"%s\" " 878 "failed due to signal %s.\n", 879 inst->ri_i.i_fmri, method, buf); 880 log_instance(inst, B_TRUE, "Method \"%s\" " 881 "failed due to signal %s", mname, buf); 882 } else { 883 log_error(LOG_WARNING, "%s: Method \"%s\" " 884 "failed with exit status %d.\n", 885 inst->ri_i.i_fmri, method, 886 WEXITSTATUS(ret_status)); 887 log_instance(inst, B_TRUE, "Method \"%s\" " 888 "failed with exit status %d", mname, 889 WEXITSTATUS(ret_status)); 890 } 891 result = EAGAIN; 892 goto contract_out; 893 } 894 895 *exit_code = WEXITSTATUS(ret_status); 896 if (*exit_code != 0) { 897 log_error(LOG_WARNING, 898 "%s: Method \"%s\" failed with exit status %d.\n", 899 inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status)); 900 } 901 902 log_instance(inst, B_TRUE, "Method \"%s\" exited with status " 903 "%d", mname, *exit_code); 904 905 if (*exit_code != 0) 906 goto contract_out; 907 908 end_time = time(NULL); 909 910 /* Give service contract remaining seconds to empty */ 911 if (timeout != METHOD_TIMEOUT_INFINITE) 912 timeout -= (end_time - start_time); 913 } 914 915 assured_kill: 916 /* 917 * For stop methods, assure that the service contract has emptied 918 * before returning. 
919 */ 920 if (type == METHOD_STOP && (!instance_is_transient_style(inst)) && 921 !(contract_is_empty(inst->ri_i.i_primary_ctid))) { 922 923 if (timeout != METHOD_TIMEOUT_INFINITE) 924 timeout_insert(inst, inst->ri_i.i_primary_ctid, 925 timeout); 926 927 for (;;) { 928 do { 929 r = ct_event_read_critical(ctfd, &ctev); 930 } while (r == EINTR); 931 if (r != 0) 932 break; 933 934 evtype = ct_event_get_type(ctev); 935 ct_event_free(ctev); 936 if (evtype == CT_PR_EV_EMPTY) 937 break; 938 } 939 if (r) { 940 result = EFAULT; 941 log_instance(inst, B_TRUE, "Error reading service " 942 "contract %ld.\n", inst->ri_i.i_primary_ctid); 943 } 944 945 if (timeout != METHOD_TIMEOUT_INFINITE) 946 if (inst->ri_timeout->te_fired) 947 result = EFAULT; 948 949 timeout_remove(inst, inst->ri_i.i_primary_ctid); 950 } 951 952 contract_out: 953 /* Abandon contracts for transient methods & methods that fail. */ 954 transient = method_is_transient(inst, type); 955 if ((transient || *exit_code != 0 || result != 0) && 956 (restarter_is_kill_method(method) < 0)) 957 method_remove_contract(inst, !transient, B_TRUE); 958 959 out: 960 if (ctfd >= 0) 961 (void) close(ctfd); 962 scf_snapshot_destroy(snap); 963 free(method); 964 return (result); 965 } 966 967 /* 968 * The method thread executes a service method to effect a state transition. 969 * The next_state of info->sf_id should be non-_NONE on entrance, and it will 970 * be _NONE on exit (state will either be what next_state was (on success), or 971 * it will be _MAINT (on error)). 972 * 973 * There are six classes of methods to consider: start & other (stop, refresh) 974 * for each of "normal" services, wait services, and transient services. For 975 * each, the method must be fetched from the repository & executed. fork()ed 976 * methods must be waited on, except for the start method of wait services 977 * (which must be registered with the wait subsystem via wait_register()). 
If 978 * the method succeeded (returned 0), then for start methods its contract 979 * should be recorded as the primary contract for the service. For other 980 * methods, it should be abandoned. If the method fails, then depending on 981 * the failure, either the method should be reexecuted or the service should 982 * be put into maintenance. Either way the contract should be abandoned. 983 */ 984 void * 985 method_thread(void *arg) 986 { 987 fork_info_t *info = arg; 988 restarter_inst_t *inst; 989 scf_handle_t *local_handle; 990 scf_instance_t *s_inst = NULL; 991 int r, exit_code; 992 boolean_t retryable; 993 const char *aux; 994 995 assert(0 <= info->sf_method_type && info->sf_method_type <= 2); 996 997 /* Get (and lock) the restarter_inst_t. */ 998 inst = inst_lookup_by_id(info->sf_id); 999 1000 assert(inst->ri_method_thread != 0); 1001 assert(instance_in_transition(inst) == 1); 1002 1003 /* 1004 * We cannot leave this function with inst in transition, because 1005 * protocol.c withholds messages for inst otherwise. 1006 */ 1007 1008 log_framework(LOG_DEBUG, "method_thread() running %s method for %s.\n", 1009 method_names[info->sf_method_type], inst->ri_i.i_fmri); 1010 1011 local_handle = libscf_handle_create_bound_loop(); 1012 1013 rebind_retry: 1014 /* get scf_instance_t */ 1015 switch (r = libscf_fmri_get_instance(local_handle, inst->ri_i.i_fmri, 1016 &s_inst)) { 1017 case 0: 1018 break; 1019 1020 case ECONNABORTED: 1021 libscf_handle_rebind(local_handle); 1022 goto rebind_retry; 1023 1024 case ENOENT: 1025 /* 1026 * It's not there, but we need to call this so protocol.c 1027 * doesn't think it's in transition anymore. 
1028 */ 1029 (void) restarter_instance_update_states(local_handle, inst, 1030 inst->ri_i.i_state, RESTARTER_STATE_NONE, RERR_NONE, 1031 NULL); 1032 goto out; 1033 1034 case EINVAL: 1035 case ENOTSUP: 1036 default: 1037 bad_error("libscf_fmri_get_instance", r); 1038 } 1039 1040 inst->ri_m_inst = s_inst; 1041 inst->ri_mi_deleted = B_FALSE; 1042 1043 retry: 1044 if (info->sf_method_type == METHOD_START) 1045 log_transition(inst, START_REQUESTED); 1046 1047 r = method_run(&inst, info->sf_method_type, &exit_code); 1048 1049 if (r == 0 && exit_code == 0) { 1050 /* Success! */ 1051 assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE); 1052 1053 /* 1054 * When a stop method succeeds, remove the primary contract of 1055 * the service, unless we're going to offline, in which case 1056 * retain the contract so we can transfer inherited contracts to 1057 * the replacement service. 1058 */ 1059 1060 if (info->sf_method_type == METHOD_STOP && 1061 inst->ri_i.i_primary_ctid != 0) { 1062 if (inst->ri_i.i_next_state == RESTARTER_STATE_OFFLINE) 1063 inst->ri_i.i_primary_ctid_stopped = 1; 1064 else 1065 method_remove_contract(inst, B_TRUE, B_TRUE); 1066 } 1067 /* 1068 * We don't care whether the handle was rebound because this is 1069 * the last thing we do with it. 1070 */ 1071 (void) restarter_instance_update_states(local_handle, inst, 1072 inst->ri_i.i_next_state, RESTARTER_STATE_NONE, 1073 info->sf_event_type, NULL); 1074 1075 (void) update_fault_count(inst, FAULT_COUNT_RESET); 1076 1077 goto out; 1078 } 1079 1080 /* Failure. Retry or go to maintenance. 
*/ 1081 1082 if (r != 0 && r != EAGAIN) { 1083 retryable = B_FALSE; 1084 } else { 1085 switch (exit_code) { 1086 case SMF_EXIT_ERR_CONFIG: 1087 case SMF_EXIT_ERR_NOSMF: 1088 case SMF_EXIT_ERR_PERM: 1089 case SMF_EXIT_ERR_FATAL: 1090 retryable = B_FALSE; 1091 break; 1092 1093 default: 1094 retryable = B_TRUE; 1095 } 1096 } 1097 1098 if (retryable && update_fault_count(inst, FAULT_COUNT_INCR) != 1) 1099 goto retry; 1100 1101 /* maintenance */ 1102 if (r == ELOOP) 1103 log_transition(inst, START_FAILED_REPEATEDLY); 1104 else if (r == ERANGE) 1105 log_transition(inst, START_FAILED_TIMEOUT_FATAL); 1106 else if (exit_code == SMF_EXIT_ERR_CONFIG) 1107 log_transition(inst, START_FAILED_CONFIGURATION); 1108 else if (exit_code == SMF_EXIT_ERR_FATAL) 1109 log_transition(inst, START_FAILED_FATAL); 1110 else 1111 log_transition(inst, START_FAILED_OTHER); 1112 1113 if (r == ELOOP) 1114 aux = "restarting_too_quickly"; 1115 else if (retryable) 1116 aux = "fault_threshold_reached"; 1117 else 1118 aux = "method_failed"; 1119 1120 (void) restarter_instance_update_states(local_handle, inst, 1121 RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_FAULT, 1122 (char *)aux); 1123 1124 if (!method_is_transient(inst, info->sf_method_type) && 1125 inst->ri_i.i_primary_ctid != 0) 1126 method_remove_contract(inst, B_TRUE, B_TRUE); 1127 1128 out: 1129 inst->ri_method_thread = 0; 1130 MUTEX_UNLOCK(&inst->ri_lock); 1131 (void) pthread_cond_broadcast(&inst->ri_method_cv); 1132 1133 scf_instance_destroy(s_inst); 1134 scf_handle_destroy(local_handle); 1135 startd_free(info, sizeof (fork_info_t)); 1136 return (NULL); 1137 } 1138