/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * method.c - method execution functions
 *
 * This file contains the routines needed to run a method: a fork(2)-exec(2)
 * invocation monitored using either the contract filesystem or waitpid(2).
 * (Plain fork1(2) support is provided in fork.c.)
 *
 * Contract Transfer
 * When we restart a service, we want to transfer any contracts that the old
 * service's contract inherited. This means that (a) we must not abandon the
 * old contract when the service dies and (b) we must write the id of the old
 * contract into the terms of the new contract. There should be limits to
 * (a), though, since we don't want to keep the contract around forever. To
 * this end we'll say that services in the offline state may have a contract
 * to be transferred and services in the disabled or maintenance states cannot.
44 * This means that when a service transitions from online (or degraded) to 45 * offline, the contract should be preserved, and when the service transitions 46 * from offline to online (i.e., the start method), we'll transfer inherited 47 * contracts. 48 */ 49 50 #include <sys/contract/process.h> 51 #include <sys/ctfs.h> 52 #include <sys/stat.h> 53 #include <sys/time.h> 54 #include <sys/types.h> 55 #include <sys/uio.h> 56 #include <sys/wait.h> 57 #include <alloca.h> 58 #include <assert.h> 59 #include <errno.h> 60 #include <fcntl.h> 61 #include <libcontract.h> 62 #include <libcontract_priv.h> 63 #include <libgen.h> 64 #include <librestart.h> 65 #include <libscf.h> 66 #include <limits.h> 67 #include <port.h> 68 #include <sac.h> 69 #include <signal.h> 70 #include <stdlib.h> 71 #include <string.h> 72 #include <strings.h> 73 #include <unistd.h> 74 75 #include "startd.h" 76 77 #define SBIN_SH "/sbin/sh" 78 79 /* 80 * Mapping from restart_on method-type to contract events. Must correspond to 81 * enum method_restart_t. 82 */ 83 static uint_t method_events[] = { 84 /* METHOD_RESTART_ALL */ 85 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE | CT_PR_EV_EMPTY, 86 /* METHOD_RESTART_EXTERNAL_FAULT */ 87 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL, 88 /* METHOD_RESTART_ANY_FAULT */ 89 CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE 90 }; 91 92 /* 93 * method_record_start(restarter_inst_t *) 94 * Record a service start for rate limiting. Place the current time 95 * in the circular array of instance starts. 96 */ 97 static void 98 method_record_start(restarter_inst_t *inst) 99 { 100 int index = inst->ri_start_index++ % RINST_START_TIMES; 101 102 inst->ri_start_time[index] = gethrtime(); 103 } 104 105 /* 106 * method_rate_critical(restarter_inst_t *) 107 * Return true if the average start interval is less than the permitted 108 * interval. Implicit success if insufficient measurements for an 109 * average exist. 
110 */ 111 static int 112 method_rate_critical(restarter_inst_t *inst) 113 { 114 uint_t n = inst->ri_start_index; 115 hrtime_t avg_ns = 0; 116 117 if (inst->ri_start_index < RINST_START_TIMES) 118 return (0); 119 120 avg_ns = 121 (inst->ri_start_time[(n - 1) % RINST_START_TIMES] - 122 inst->ri_start_time[n % RINST_START_TIMES]) / 123 (RINST_START_TIMES - 1); 124 125 return (avg_ns < RINST_FAILURE_RATE_NS); 126 } 127 128 /* 129 * int method_is_transient() 130 * Determine if the method for the given instance is transient, 131 * from a contract perspective. Return 1 if it is, and 0 if it isn't. 132 */ 133 static int 134 method_is_transient(restarter_inst_t *inst, int type) 135 { 136 if (instance_is_transient_style(inst) || type != METHOD_START) 137 return (1); 138 else 139 return (0); 140 } 141 142 /* 143 * void method_store_contract() 144 * Store the newly created contract id into local structures and 145 * the repository. If the repository connection is broken it is rebound. 146 */ 147 static void 148 method_store_contract(restarter_inst_t *inst, int type, ctid_t *cid) 149 { 150 int r; 151 boolean_t primary; 152 153 if (errno = contract_latest(cid)) 154 uu_die("%s: Couldn't get new contract's id", inst->ri_i.i_fmri); 155 156 primary = !method_is_transient(inst, type); 157 158 if (!primary) { 159 if (inst->ri_i.i_transient_ctid != 0) { 160 log_framework(LOG_INFO, 161 "%s: transient ctid expected to be 0 but " 162 "was set to %ld\n", inst->ri_i.i_fmri, 163 inst->ri_i.i_transient_ctid); 164 } 165 166 inst->ri_i.i_transient_ctid = *cid; 167 } else { 168 if (inst->ri_i.i_primary_ctid != 0) { 169 /* 170 * There was an old contract that we transferred. 171 * Remove it. 
172 */ 173 method_remove_contract(inst, B_TRUE, B_FALSE); 174 } 175 176 if (inst->ri_i.i_primary_ctid != 0) { 177 log_framework(LOG_INFO, 178 "%s: primary ctid expected to be 0 but " 179 "was set to %ld\n", inst->ri_i.i_fmri, 180 inst->ri_i.i_primary_ctid); 181 } 182 183 inst->ri_i.i_primary_ctid = *cid; 184 inst->ri_i.i_primary_ctid_stopped = 0; 185 186 contract_hash_store(*cid, inst->ri_id); 187 } 188 189 again: 190 if (inst->ri_mi_deleted) 191 return; 192 193 r = restarter_store_contract(inst->ri_m_inst, *cid, primary ? 194 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT); 195 switch (r) { 196 case 0: 197 break; 198 199 case ECANCELED: 200 inst->ri_mi_deleted = B_TRUE; 201 break; 202 203 case ECONNABORTED: 204 libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst)); 205 /* FALLTHROUGH */ 206 207 case EBADF: 208 libscf_reget_instance(inst); 209 goto again; 210 211 case ENOMEM: 212 case EPERM: 213 case EACCES: 214 case EROFS: 215 uu_die("%s: Couldn't store contract id %ld", 216 inst->ri_i.i_fmri, *cid); 217 /* NOTREACHED */ 218 219 case EINVAL: 220 default: 221 bad_error("restarter_store_contract", r); 222 } 223 } 224 225 /* 226 * void method_remove_contract() 227 * Remove any non-permanent contracts from internal structures and 228 * the repository, then abandon them. 229 * Returns 230 * 0 - success 231 * ECANCELED - inst was deleted from the repository 232 * 233 * If the repository connection was broken, it is rebound. 234 */ 235 void 236 method_remove_contract(restarter_inst_t *inst, boolean_t primary, 237 boolean_t abandon) 238 { 239 ctid_t * const ctidp = primary ? &inst->ri_i.i_primary_ctid : 240 &inst->ri_i.i_transient_ctid; 241 242 int r; 243 244 assert(*ctidp != 0); 245 246 log_framework(LOG_DEBUG, "Removing %s contract %lu for %s.\n", 247 primary ? 
"primary" : "transient", *ctidp, inst->ri_i.i_fmri); 248 249 if (abandon) 250 contract_abandon(*ctidp); 251 252 again: 253 if (inst->ri_mi_deleted) { 254 r = ECANCELED; 255 goto out; 256 } 257 258 r = restarter_remove_contract(inst->ri_m_inst, *ctidp, primary ? 259 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT); 260 switch (r) { 261 case 0: 262 break; 263 264 case ECANCELED: 265 inst->ri_mi_deleted = B_TRUE; 266 break; 267 268 case ECONNABORTED: 269 libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst)); 270 /* FALLTHROUGH */ 271 272 case EBADF: 273 libscf_reget_instance(inst); 274 goto again; 275 276 case ENOMEM: 277 case EPERM: 278 case EACCES: 279 case EROFS: 280 log_error(LOG_INFO, "%s: Couldn't remove contract id %ld: " 281 "%s.\n", inst->ri_i.i_fmri, *ctidp, strerror(r)); 282 break; 283 284 case EINVAL: 285 default: 286 bad_error("restarter_remove_contract", r); 287 } 288 289 out: 290 if (primary) 291 contract_hash_remove(*ctidp); 292 293 *ctidp = 0; 294 } 295 296 /* 297 * int method_ready_contract(restarter_inst_t *, int, method_restart_t, int) 298 * 299 * Activate a contract template for the type method of inst. type, 300 * restart_on, and cte_mask dictate the critical events term of the contract. 301 * Returns 302 * 0 - success 303 * ECANCELED - inst has been deleted from the repository 304 */ 305 static int 306 method_ready_contract(restarter_inst_t *inst, int type, 307 method_restart_t restart_on, uint_t cte_mask) 308 { 309 int tmpl, err, istrans, iswait, ret; 310 uint_t cevents, fevents; 311 312 /* 313 * Correctly supporting wait-style services is tricky without 314 * rearchitecting startd to cope with multiple event sources 315 * simultaneously trying to stop an instance. Until a better 316 * solution is implemented, we avoid this problem for 317 * wait-style services by making contract events fatal and 318 * letting the wait code alone handle stopping the service. 
319 */ 320 iswait = instance_is_wait_style(inst); 321 istrans = method_is_transient(inst, type); 322 323 tmpl = open64(CTFS_ROOT "/process/template", O_RDWR); 324 if (tmpl == -1) 325 uu_die("Could not create contract template"); 326 327 /* 328 * We assume non-login processes are unlikely to create 329 * multiple process groups, and set CT_PR_PGRPONLY for all 330 * wait-style services' contracts. 331 */ 332 err = ct_pr_tmpl_set_param(tmpl, CT_PR_INHERIT | CT_PR_REGENT | 333 (iswait ? CT_PR_PGRPONLY : 0)); 334 assert(err == 0); 335 336 if (istrans) { 337 cevents = 0; 338 fevents = 0; 339 } else { 340 assert(restart_on >= 0); 341 assert(restart_on <= METHOD_RESTART_ANY_FAULT); 342 cevents = method_events[restart_on] & ~cte_mask; 343 fevents = iswait ? 344 (method_events[restart_on] & ~cte_mask & CT_PR_ALLFATAL) : 345 0; 346 } 347 348 err = ct_tmpl_set_critical(tmpl, cevents); 349 assert(err == 0); 350 351 err = ct_tmpl_set_informative(tmpl, 0); 352 assert(err == 0); 353 err = ct_pr_tmpl_set_fatal(tmpl, fevents); 354 assert(err == 0); 355 356 err = ct_tmpl_set_cookie(tmpl, istrans ? METHOD_OTHER_COOKIE : 357 METHOD_START_COOKIE); 358 assert(err == 0); 359 360 if (type == METHOD_START && inst->ri_i.i_primary_ctid != 0) { 361 ret = ct_pr_tmpl_set_transfer(tmpl, inst->ri_i.i_primary_ctid); 362 switch (ret) { 363 case 0: 364 break; 365 366 case ENOTEMPTY: 367 /* No contracts for you! 
*/ 368 method_remove_contract(inst, B_TRUE, B_TRUE); 369 if (inst->ri_mi_deleted) { 370 ret = ECANCELED; 371 goto out; 372 } 373 break; 374 375 case EINVAL: 376 case ESRCH: 377 case EACCES: 378 default: 379 bad_error("ct_pr_tmpl_set_transfer", ret); 380 } 381 } 382 383 err = ct_tmpl_activate(tmpl); 384 assert(err == 0); 385 386 ret = 0; 387 388 out: 389 err = close(tmpl); 390 assert(err == 0); 391 392 return (ret); 393 } 394 395 static const char *method_names[] = { "start", "stop", "refresh" }; 396 397 static void 398 exec_method(const restarter_inst_t *inst, int type, const char *method, 399 struct method_context *mcp, uint8_t need_session) 400 { 401 char *cmd; 402 const char *errf; 403 char **nenv; 404 405 cmd = uu_msprintf("exec %s", method); 406 407 if (inst->ri_utmpx_prefix[0] != '\0' && inst->ri_utmpx_prefix != NULL) 408 (void) utmpx_mark_init(getpid(), inst->ri_utmpx_prefix); 409 410 setlog(inst->ri_logstem); 411 log_instance(inst, B_FALSE, "Executing %s method (\"%s\")", 412 method_names[type], method); 413 414 if (need_session) 415 (void) setpgrp(); 416 417 /* Set credentials. 
*/ 418 errno = restarter_set_method_context(mcp, &errf); 419 if (errno != 0) { 420 (void) fputs("svc.startd could not set context for method: ", 421 stderr); 422 423 if (errno == -1) { 424 if (strcmp(errf, "core_set_process_path") == 0) { 425 (void) fputs("Could not set corefile path.\n", 426 stderr); 427 } else if (strcmp(errf, "setproject") == 0) { 428 (void) fprintf(stderr, "%s: a resource control " 429 "assignment failed\n", errf); 430 } else if (strcmp(errf, "pool_set_binding") == 0) { 431 (void) fprintf(stderr, "%s: a system error " 432 "occurred\n", errf); 433 } else { 434 #ifndef NDEBUG 435 uu_warn("%s:%d: Bad function name \"%s\" for " 436 "error %d from " 437 "restarter_set_method_context().\n", 438 __FILE__, __LINE__, errf, errno); 439 #endif 440 abort(); 441 } 442 443 exit(1); 444 } 445 446 if (errf != NULL && strcmp(errf, "pool_set_binding") == 0) { 447 switch (errno) { 448 case ENOENT: 449 (void) fprintf(stderr, "%s: the pool could not " 450 "be found\n", errf); 451 break; 452 453 case EBADF: 454 (void) fprintf(stderr, "%s: the configuration " 455 "is invalid\n", errf); 456 break; 457 458 default: 459 #ifndef NDEBUG 460 uu_warn("%s:%d: Bad error %d for function %s " 461 "in restarter_set_method_context().\n", 462 __FILE__, __LINE__, errno, errf); 463 #endif 464 abort(); 465 } 466 467 exit(SMF_EXIT_ERR_CONFIG); 468 } 469 470 if (errf != NULL) { 471 perror(errf); 472 473 switch (errno) { 474 case EINVAL: 475 case EPERM: 476 case ENOENT: 477 case ENAMETOOLONG: 478 case ERANGE: 479 case ESRCH: 480 exit(SMF_EXIT_ERR_CONFIG); 481 /* NOTREACHED */ 482 483 default: 484 exit(1); 485 } 486 } 487 488 switch (errno) { 489 case ENOMEM: 490 (void) fputs("Out of memory.\n", stderr); 491 exit(1); 492 /* NOTREACHED */ 493 494 case ENOENT: 495 (void) fputs("Missing passwd entry for user.\n", 496 stderr); 497 exit(SMF_EXIT_ERR_CONFIG); 498 /* NOTREACHED */ 499 500 default: 501 #ifndef NDEBUG 502 uu_warn("%s:%d: Bad miscellaneous error %d from " 503 
"restarter_set_method_context().\n", __FILE__, 504 __LINE__, errno); 505 #endif 506 abort(); 507 } 508 } 509 510 nenv = set_smf_env(mcp->env, mcp->env_sz, NULL, inst, method); 511 512 log_preexec(); 513 514 (void) execle(SBIN_SH, SBIN_SH, "-c", cmd, NULL, nenv); 515 516 exit(10); 517 } 518 519 static void 520 write_status(restarter_inst_t *inst, const char *mname, int stat) 521 { 522 int r; 523 524 again: 525 if (inst->ri_mi_deleted) 526 return; 527 528 r = libscf_write_method_status(inst->ri_m_inst, mname, stat); 529 switch (r) { 530 case 0: 531 break; 532 533 case ECONNABORTED: 534 libscf_reget_instance(inst); 535 goto again; 536 537 case ECANCELED: 538 inst->ri_mi_deleted = 1; 539 break; 540 541 case EPERM: 542 case EACCES: 543 case EROFS: 544 log_framework(LOG_INFO, "Could not write exit status " 545 "for %s method of %s: %s.\n", mname, 546 inst->ri_i.i_fmri, strerror(r)); 547 break; 548 549 case ENAMETOOLONG: 550 default: 551 bad_error("libscf_write_method_status", r); 552 } 553 } 554 555 /* 556 * int method_run() 557 * Execute the type method of instp. If it requires a fork(), wait for it 558 * to return and return its exit code in *exit_code. Otherwise set 559 * *exit_code to 0 if the method succeeds & -1 if it fails. If the 560 * repository connection is broken, it is rebound, but inst may not be 561 * reset. 562 * Returns 563 * 0 - success 564 * EINVAL - A correct method or method context couldn't be retrieved. 565 * EIO - Contract kill failed. 566 * EFAULT - Method couldn't be executed successfully. 567 * ELOOP - Retry threshold exceeded. 568 * ECANCELED - inst was deleted from the repository before method was run 569 * ERANGE - Timeout retry threshold exceeded. 570 * EAGAIN - Failed due to external cause, retry. 
571 */ 572 int 573 method_run(restarter_inst_t **instp, int type, int *exit_code) 574 { 575 char *method; 576 int ret_status; 577 pid_t pid; 578 method_restart_t restart_on; 579 uint_t cte_mask; 580 uint8_t need_session; 581 scf_handle_t *h; 582 scf_snapshot_t *snap; 583 const char *mname; 584 const char *errstr; 585 struct method_context *mcp; 586 int result = 0, timeout_fired = 0; 587 int sig, r; 588 boolean_t transient; 589 uint64_t timeout; 590 uint8_t timeout_retry; 591 ctid_t ctid; 592 int ctfd = -1; 593 ct_evthdl_t ctev; 594 uint_t evtype; 595 restarter_inst_t *inst = *instp; 596 int id = inst->ri_id; 597 int forkerr; 598 599 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 600 assert(instance_in_transition(inst)); 601 602 if (inst->ri_mi_deleted) 603 return (ECANCELED); 604 605 *exit_code = 0; 606 607 assert(0 <= type && type <= 2); 608 mname = method_names[type]; 609 610 if (type == METHOD_START) 611 inst->ri_pre_online_hook(); 612 613 h = scf_instance_handle(inst->ri_m_inst); 614 615 snap = scf_snapshot_create(h); 616 if (snap == NULL || 617 scf_instance_get_snapshot(inst->ri_m_inst, "running", snap) != 0) { 618 log_framework(LOG_DEBUG, 619 "Could not get running snapshot for %s. " 620 "Using editing version to run method %s.\n", 621 inst->ri_i.i_fmri, mname); 622 scf_snapshot_destroy(snap); 623 snap = NULL; 624 } 625 626 /* 627 * After this point, we may be logging to the instance log. 628 * Make sure we've noted where that log is as a property of 629 * the instance. 
630 */ 631 r = libscf_note_method_log(inst->ri_m_inst, st->st_log_prefix, 632 inst->ri_logstem); 633 if (r != 0) { 634 log_framework(LOG_WARNING, 635 "%s: couldn't note log location: %s\n", 636 inst->ri_i.i_fmri, strerror(r)); 637 } 638 639 if ((method = libscf_get_method(h, type, inst, snap, &restart_on, 640 &cte_mask, &need_session, &timeout, &timeout_retry)) == NULL) { 641 if (errno == LIBSCF_PGROUP_ABSENT) { 642 log_framework(LOG_DEBUG, 643 "%s: instance has no method property group '%s'.\n", 644 inst->ri_i.i_fmri, mname); 645 if (type == METHOD_REFRESH) 646 log_instance(inst, B_TRUE, "No '%s' method " 647 "defined. Treating as :true.", mname); 648 else 649 log_instance(inst, B_TRUE, "Method property " 650 "group '%s' is not present.", mname); 651 scf_snapshot_destroy(snap); 652 return (0); 653 } else if (errno == LIBSCF_PROPERTY_ABSENT) { 654 log_framework(LOG_DEBUG, 655 "%s: instance has no '%s/exec' method property.\n", 656 inst->ri_i.i_fmri, mname); 657 log_instance(inst, B_TRUE, "Method property '%s/exec " 658 "is not present.", mname); 659 scf_snapshot_destroy(snap); 660 return (0); 661 } else { 662 log_error(LOG_WARNING, 663 "%s: instance libscf_get_method failed\n", 664 inst->ri_i.i_fmri); 665 scf_snapshot_destroy(snap); 666 return (EINVAL); 667 } 668 } 669 670 /* open service contract if stopping a non-transient service */ 671 if (type == METHOD_STOP && (!instance_is_transient_style(inst))) { 672 if (inst->ri_i.i_primary_ctid == 0) { 673 /* service is not running, nothing to stop */ 674 log_framework(LOG_DEBUG, "%s: instance has no primary " 675 "contract, no service to stop.\n", 676 inst->ri_i.i_fmri); 677 scf_snapshot_destroy(snap); 678 return (0); 679 } 680 if ((ctfd = contract_open(inst->ri_i.i_primary_ctid, "process", 681 "events", O_RDONLY)) < 0) { 682 result = EFAULT; 683 log_instance(inst, B_TRUE, "Could not open service " 684 "contract %ld. 
Stop method not run.\n", 685 inst->ri_i.i_primary_ctid); 686 goto out; 687 } 688 } 689 690 if (restarter_is_null_method(method)) { 691 log_framework(LOG_DEBUG, "%s: null method succeeds\n", 692 inst->ri_i.i_fmri); 693 694 log_instance(inst, B_TRUE, "Executing %s method (null)", mname); 695 696 if (type == METHOD_START) 697 write_status(inst, mname, 0); 698 goto out; 699 } 700 701 sig = restarter_is_kill_method(method); 702 if (sig >= 0) { 703 704 if (inst->ri_i.i_primary_ctid == 0) { 705 log_error(LOG_ERR, "%s: :kill with no contract\n", 706 inst->ri_i.i_fmri); 707 result = EINVAL; 708 goto out; 709 } 710 711 log_framework(LOG_DEBUG, 712 "%s: :killing contract with signal %d\n", 713 inst->ri_i.i_fmri, sig); 714 715 log_instance(inst, B_TRUE, "Executing %s method (:kill)", 716 mname); 717 718 if (contract_kill(inst->ri_i.i_primary_ctid, sig, 719 inst->ri_i.i_fmri) != 0) { 720 result = EIO; 721 goto out; 722 } else 723 goto assured_kill; 724 } 725 726 log_framework(LOG_DEBUG, "%s: forking to run method %s\n", 727 inst->ri_i.i_fmri, method); 728 729 errstr = restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION, 730 inst->ri_m_inst, snap, mname, method, &mcp); 731 732 if (errstr != NULL) { 733 log_error(LOG_WARNING, "%s: %s\n", inst->ri_i.i_fmri, errstr); 734 result = EINVAL; 735 goto out; 736 } 737 738 r = method_ready_contract(inst, type, restart_on, cte_mask); 739 if (r != 0) { 740 assert(r == ECANCELED); 741 assert(inst->ri_mi_deleted); 742 restarter_free_method_context(mcp); 743 result = ECANCELED; 744 goto out; 745 } 746 747 /* 748 * Validate safety of method contexts, to save children work. 749 */ 750 if (!restarter_rm_libs_loadable()) 751 log_framework(LOG_DEBUG, "%s: method contexts limited " 752 "to root-accessible libraries\n", inst->ri_i.i_fmri); 753 754 /* 755 * If the service is restarting too quickly, send it to 756 * maintenance. 
757 */ 758 if (type == METHOD_START) { 759 method_record_start(inst); 760 if (method_rate_critical(inst)) { 761 log_instance(inst, B_TRUE, "Restarting too quickly, " 762 "changing state to maintenance"); 763 result = ELOOP; 764 goto out; 765 } 766 } 767 768 pid = startd_fork1(&forkerr); 769 if (pid == 0) 770 exec_method(inst, type, method, mcp, need_session); 771 772 if (pid == -1) { 773 if (forkerr == EAGAIN) 774 result = EAGAIN; 775 else 776 result = EFAULT; 777 778 log_error(LOG_WARNING, 779 "%s: Couldn't fork to execute method %s: %s\n", 780 inst->ri_i.i_fmri, method, strerror(forkerr)); 781 782 goto out; 783 } 784 785 restarter_free_method_context(mcp); 786 787 /* 788 * Get the contract id, decide whether it is primary or transient, and 789 * stash it in inst & the repository. 790 */ 791 method_store_contract(inst, type, &ctid); 792 793 /* 794 * Similarly for the start method PID. 795 */ 796 if (type == METHOD_START && !inst->ri_mi_deleted) 797 (void) libscf_write_start_pid(inst->ri_m_inst, pid); 798 799 if (instance_is_wait_style(inst) && type == METHOD_START) { 800 /* Wait style instances don't get timeouts on start methods. */ 801 if (wait_register(pid, inst->ri_i.i_fmri, 1, 0)) { 802 log_error(LOG_WARNING, 803 "%s: couldn't register %ld for wait\n", 804 inst->ri_i.i_fmri, pid); 805 result = EFAULT; 806 goto contract_out; 807 } 808 write_status(inst, mname, 0); 809 810 } else { 811 int r, err; 812 time_t start_time; 813 time_t end_time; 814 815 /* 816 * Because on upgrade/live-upgrade we may have no chance 817 * to override faulty timeout values on the way to 818 * manifest import, all services on the path to manifest 819 * import are treated the same as INFINITE timeout services. 820 */ 821 822 start_time = time(NULL); 823 if (timeout != METHOD_TIMEOUT_INFINITE && !is_timeout_ovr(inst)) 824 timeout_insert(inst, ctid, timeout); 825 else 826 timeout = METHOD_TIMEOUT_INFINITE; 827 828 /* Unlock the instance while waiting for the method. 
*/ 829 MUTEX_UNLOCK(&inst->ri_lock); 830 831 do 832 r = waitpid(pid, &ret_status, NULL); 833 while (r == -1 && errno == EINTR); 834 if (r == -1) 835 err = errno; 836 837 /* Re-grab the lock. */ 838 inst = inst_lookup_by_id(id); 839 840 /* 841 * inst can't be removed, as the removal thread waits 842 * for completion of this one. 843 */ 844 assert(inst != NULL); 845 *instp = inst; 846 847 if (inst->ri_timeout != NULL && inst->ri_timeout->te_fired) 848 timeout_fired = 1; 849 850 timeout_remove(inst, ctid); 851 852 log_framework(LOG_DEBUG, 853 "%s method for %s exited with status %d.\n", mname, 854 inst->ri_i.i_fmri, WEXITSTATUS(ret_status)); 855 856 if (r == -1) { 857 log_error(LOG_WARNING, 858 "Couldn't waitpid() for %s method of %s (%s).\n", 859 mname, inst->ri_i.i_fmri, strerror(err)); 860 result = EFAULT; 861 goto contract_out; 862 } 863 864 if (type == METHOD_START) 865 write_status(inst, mname, ret_status); 866 867 /* return ERANGE if this service doesn't retry on timeout */ 868 if (timeout_fired == 1 && timeout_retry == 0) { 869 result = ERANGE; 870 goto contract_out; 871 } 872 873 if (!WIFEXITED(ret_status)) { 874 /* 875 * If method didn't exit itself (it was killed by an 876 * external entity, etc.), consider the entire 877 * method_run as failed. 
878 */ 879 if (WIFSIGNALED(ret_status)) { 880 char buf[SIG2STR_MAX]; 881 (void) sig2str(WTERMSIG(ret_status), buf); 882 883 log_error(LOG_WARNING, "%s: Method \"%s\" " 884 "failed due to signal %s.\n", 885 inst->ri_i.i_fmri, method, buf); 886 log_instance(inst, B_TRUE, "Method \"%s\" " 887 "failed due to signal %s", mname, buf); 888 } else { 889 log_error(LOG_WARNING, "%s: Method \"%s\" " 890 "failed with exit status %d.\n", 891 inst->ri_i.i_fmri, method, 892 WEXITSTATUS(ret_status)); 893 log_instance(inst, B_TRUE, "Method \"%s\" " 894 "failed with exit status %d", mname, 895 WEXITSTATUS(ret_status)); 896 } 897 result = EAGAIN; 898 goto contract_out; 899 } 900 901 *exit_code = WEXITSTATUS(ret_status); 902 if (*exit_code != 0) { 903 log_error(LOG_WARNING, 904 "%s: Method \"%s\" failed with exit status %d.\n", 905 inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status)); 906 } 907 908 log_instance(inst, B_TRUE, "Method \"%s\" exited with status " 909 "%d", mname, *exit_code); 910 911 if (*exit_code != 0) 912 goto contract_out; 913 914 end_time = time(NULL); 915 916 /* Give service contract remaining seconds to empty */ 917 if (timeout != METHOD_TIMEOUT_INFINITE) 918 timeout -= (end_time - start_time); 919 } 920 921 assured_kill: 922 /* 923 * For stop methods, assure that the service contract has emptied 924 * before returning. 
925 */ 926 if (type == METHOD_STOP && (!instance_is_transient_style(inst)) && 927 !(contract_is_empty(inst->ri_i.i_primary_ctid))) { 928 929 if (timeout != METHOD_TIMEOUT_INFINITE) 930 timeout_insert(inst, inst->ri_i.i_primary_ctid, 931 timeout); 932 933 for (;;) { 934 do { 935 r = ct_event_read_critical(ctfd, &ctev); 936 } while (r == EINTR); 937 if (r != 0) 938 break; 939 940 evtype = ct_event_get_type(ctev); 941 ct_event_free(ctev); 942 if (evtype == CT_PR_EV_EMPTY) 943 break; 944 } 945 if (r) { 946 result = EFAULT; 947 log_instance(inst, B_TRUE, "Error reading service " 948 "contract %ld.\n", inst->ri_i.i_primary_ctid); 949 } 950 951 if (timeout != METHOD_TIMEOUT_INFINITE) 952 if (inst->ri_timeout->te_fired) 953 result = EFAULT; 954 955 timeout_remove(inst, inst->ri_i.i_primary_ctid); 956 } 957 958 contract_out: 959 /* Abandon contracts for transient methods & methods that fail. */ 960 transient = method_is_transient(inst, type); 961 if ((transient || *exit_code != 0 || result != 0) && 962 (restarter_is_kill_method(method) < 0)) 963 method_remove_contract(inst, !transient, B_TRUE); 964 965 out: 966 if (ctfd >= 0) 967 (void) close(ctfd); 968 scf_snapshot_destroy(snap); 969 free(method); 970 return (result); 971 } 972 973 /* 974 * The method thread executes a service method to effect a state transition. 975 * The next_state of info->sf_id should be non-_NONE on entrance, and it will 976 * be _NONE on exit (state will either be what next_state was (on success), or 977 * it will be _MAINT (on error)). 978 * 979 * There are six classes of methods to consider: start & other (stop, refresh) 980 * for each of "normal" services, wait services, and transient services. For 981 * each, the method must be fetched from the repository & executed. fork()ed 982 * methods must be waited on, except for the start method of wait services 983 * (which must be registered with the wait subsystem via wait_register()). 
If 984 * the method succeeded (returned 0), then for start methods its contract 985 * should be recorded as the primary contract for the service. For other 986 * methods, it should be abandoned. If the method fails, then depending on 987 * the failure, either the method should be reexecuted or the service should 988 * be put into maintenance. Either way the contract should be abandoned. 989 */ 990 void * 991 method_thread(void *arg) 992 { 993 fork_info_t *info = arg; 994 restarter_inst_t *inst; 995 scf_handle_t *local_handle; 996 scf_instance_t *s_inst = NULL; 997 int r, exit_code; 998 boolean_t retryable; 999 const char *aux; 1000 1001 assert(0 <= info->sf_method_type && info->sf_method_type <= 2); 1002 1003 /* Get (and lock) the restarter_inst_t. */ 1004 inst = inst_lookup_by_id(info->sf_id); 1005 1006 assert(inst->ri_method_thread != 0); 1007 assert(instance_in_transition(inst) == 1); 1008 1009 /* 1010 * We cannot leave this function with inst in transition, because 1011 * protocol.c withholds messages for inst otherwise. 1012 */ 1013 1014 log_framework(LOG_DEBUG, "method_thread() running %s method for %s.\n", 1015 method_names[info->sf_method_type], inst->ri_i.i_fmri); 1016 1017 local_handle = libscf_handle_create_bound_loop(); 1018 1019 rebind_retry: 1020 /* get scf_instance_t */ 1021 switch (r = libscf_fmri_get_instance(local_handle, inst->ri_i.i_fmri, 1022 &s_inst)) { 1023 case 0: 1024 break; 1025 1026 case ECONNABORTED: 1027 libscf_handle_rebind(local_handle); 1028 goto rebind_retry; 1029 1030 case ENOENT: 1031 /* 1032 * It's not there, but we need to call this so protocol.c 1033 * doesn't think it's in transition anymore. 
1034 */ 1035 (void) restarter_instance_update_states(local_handle, inst, 1036 inst->ri_i.i_state, RESTARTER_STATE_NONE, RERR_NONE, 1037 NULL); 1038 goto out; 1039 1040 case EINVAL: 1041 case ENOTSUP: 1042 default: 1043 bad_error("libscf_fmri_get_instance", r); 1044 } 1045 1046 inst->ri_m_inst = s_inst; 1047 inst->ri_mi_deleted = B_FALSE; 1048 1049 retry: 1050 if (info->sf_method_type == METHOD_START) 1051 log_transition(inst, START_REQUESTED); 1052 1053 r = method_run(&inst, info->sf_method_type, &exit_code); 1054 1055 if (r == 0 && exit_code == 0) { 1056 /* Success! */ 1057 assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE); 1058 1059 /* 1060 * When a stop method succeeds, remove the primary contract of 1061 * the service, unless we're going to offline, in which case 1062 * retain the contract so we can transfer inherited contracts to 1063 * the replacement service. 1064 */ 1065 1066 if (info->sf_method_type == METHOD_STOP && 1067 inst->ri_i.i_primary_ctid != 0) { 1068 if (inst->ri_i.i_next_state == RESTARTER_STATE_OFFLINE) 1069 inst->ri_i.i_primary_ctid_stopped = 1; 1070 else 1071 method_remove_contract(inst, B_TRUE, B_TRUE); 1072 } 1073 /* 1074 * We don't care whether the handle was rebound because this is 1075 * the last thing we do with it. 1076 */ 1077 (void) restarter_instance_update_states(local_handle, inst, 1078 inst->ri_i.i_next_state, RESTARTER_STATE_NONE, 1079 info->sf_event_type, NULL); 1080 1081 (void) update_fault_count(inst, FAULT_COUNT_RESET); 1082 1083 goto out; 1084 } 1085 1086 /* Failure. Retry or go to maintenance. 
*/ 1087 1088 if (r != 0 && r != EAGAIN) { 1089 retryable = B_FALSE; 1090 } else { 1091 switch (exit_code) { 1092 case SMF_EXIT_ERR_CONFIG: 1093 case SMF_EXIT_ERR_NOSMF: 1094 case SMF_EXIT_ERR_PERM: 1095 case SMF_EXIT_ERR_FATAL: 1096 retryable = B_FALSE; 1097 break; 1098 1099 default: 1100 retryable = B_TRUE; 1101 } 1102 } 1103 1104 if (retryable && update_fault_count(inst, FAULT_COUNT_INCR) != 1) 1105 goto retry; 1106 1107 /* maintenance */ 1108 if (r == ELOOP) 1109 log_transition(inst, START_FAILED_REPEATEDLY); 1110 else if (r == ERANGE) 1111 log_transition(inst, START_FAILED_TIMEOUT_FATAL); 1112 else if (exit_code == SMF_EXIT_ERR_CONFIG) 1113 log_transition(inst, START_FAILED_CONFIGURATION); 1114 else if (exit_code == SMF_EXIT_ERR_FATAL) 1115 log_transition(inst, START_FAILED_FATAL); 1116 else 1117 log_transition(inst, START_FAILED_OTHER); 1118 1119 if (r == ELOOP) 1120 aux = "restarting_too_quickly"; 1121 else if (retryable) 1122 aux = "fault_threshold_reached"; 1123 else 1124 aux = "method_failed"; 1125 1126 (void) restarter_instance_update_states(local_handle, inst, 1127 RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_FAULT, 1128 (char *)aux); 1129 1130 if (!method_is_transient(inst, info->sf_method_type) && 1131 inst->ri_i.i_primary_ctid != 0) 1132 method_remove_contract(inst, B_TRUE, B_TRUE); 1133 1134 out: 1135 inst->ri_method_thread = 0; 1136 MUTEX_UNLOCK(&inst->ri_lock); 1137 (void) pthread_cond_broadcast(&inst->ri_method_cv); 1138 1139 scf_instance_destroy(s_inst); 1140 scf_handle_destroy(local_handle); 1141 startd_free(info, sizeof (fork_info_t)); 1142 return (NULL); 1143 } 1144