/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * restarter.c - service manipulation
 *
 * This component manages services whose restarter is svc.startd, the standard
 * restarter.  It translates restarter protocol events from the graph engine
 * into actions on processes, as a delegated restarter would do.
 *
 * The master restarter manages a number of always-running threads:
 *   - restarter event thread: events from the graph engine
 *   - timeout thread: thread to fire queued timeouts
 *   - contract thread: thread to handle contract events
 *   - wait thread: thread to handle wait-based services
 *
 * The other threads are created as-needed:
 *   - per-instance method threads
 *   - per-instance event processing threads
 *
 * The interaction of all threads must result in the following conditions
 * being satisfied (on a per-instance basis):
 *   - restarter events must be processed in order
 *   - method execution must be serialized
 *   - instance delete must be held until outstanding methods are complete
 *   - contract events shouldn't be processed while a method is running
 *   - timeouts should fire even when a method is running
 *
 * Service instances are represented by restarter_inst_t's and are kept in the
 * instance_list list.
 *
 * Service States
 *   The current state of a service instance is kept in
 *   restarter_inst_t->ri_i.i_state.  If transition to a new state could take
 *   some time, then before we effect the transition we set
 *   restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
 *   rotate i_next_state to i_state and set i_next_state to
 *   RESTARTER_STATE_NONE.  So usually i_next_state is _NONE when ri_lock is
 *   not held.  The exception is when we launch methods, which are done with
 *   a separate thread.  To keep any other threads from grabbing ri_lock before
 *   method_thread() does, we set ri_method_thread to the thread id of the
 *   method thread, and when it is nonzero any thread with a different thread
 *   id waits on ri_method_cv.
 *
 * Method execution is serialized by blocking on ri_method_cv in
 * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread.  This
 * also prevents the instance structure from being deleted until all
 * outstanding operations such as method_thread() have finished.
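 *
 * As a simplified, illustrative sketch only (the real code commits state
 * changes through restarter_instance_update_states(), and "target_state"
 * here merely stands in for the state being entered), that handoff looks
 * roughly like:
 *
 *	MUTEX_LOCK(&inst->ri_lock);
 *	inst->ri_i.i_next_state = target_state;	-- declare intent
 *	inst->ri_method_thread = startd_thread_create(method_thread, info);
 *	MUTEX_UNLOCK(&inst->ri_lock);
 *	... method runs; other threads block in inst_lookup_by_id() ...
 *	inst->ri_i.i_state = target_state;	-- rotate when done
 *	inst->ri_i.i_next_state = RESTARTER_STATE_NONE;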
 *
 * Lock ordering:
 *
 * dgraph_lock [can be held when taking:]
 *   utmpx_lock
 *   dictionary->dict_lock
 *   st->st_load_lock
 *   wait_info_lock
 *   ru->restarter_update_lock
 *     restarter_queue->rpeq_lock
 *   instance_list.ril_lock
 *     inst->ri_lock
 *       st->st_configd_live_lock
 *
 * instance_list.ril_lock
 *   graph_queue->gpeq_lock
 *   gu->gu_lock
 *   st->st_configd_live_lock
 *   dictionary->dict_lock
 *   inst->ri_lock
 *     graph_queue->gpeq_lock
 *     gu->gu_lock
 *     tu->tu_lock
 *     tq->tq_lock
 *     inst->ri_queue_lock
 *       wait_info_lock
 *       bp->cb_lock
 *     utmpx_lock
 *
 * single_user_thread_lock
 *   wait_info_lock
 *   utmpx_lock
 *
 * gu_freeze_lock
 *
 * logbuf_mutex nests inside pretty much everything.
 */

#include <sys/contract/process.h>
#include <sys/ctfs.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libcontract.h>
#include <libcontract_priv.h>
#include <libintl.h>
#include <librestart.h>
#include <librestart_priv.h>
#include <libuutil.h>
#include <limits.h>
#include <poll.h>
#include <port.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <strings.h>
#include <unistd.h>

#include "startd.h"
#include "protocol.h"

static uu_list_pool_t *restarter_instance_pool;
static restarter_instance_list_t instance_list;

static uu_list_pool_t *restarter_queue_pool;

/*ARGSUSED*/
static int
restarter_instance_compare(const void *lc_arg, const void *rc_arg,
    void *private)
{
	int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
	int rc_id = *(int *)rc_arg;

	if (lc_id > rc_id)
		return (1);
	if (lc_id < rc_id)
		return (-1);
	return (0);
}

static restarter_inst_t *
inst_lookup_by_name(const char *name)
{
	int id;

	id = dict_lookup_byname(name);
	if (id == -1)
		return (NULL);

	return (inst_lookup_by_id(id));
}

restarter_inst_t *
inst_lookup_by_id(int id)
{
	restarter_inst_t *inst;

	MUTEX_LOCK(&instance_list.ril_lock);
	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
	if (inst != NULL)
		MUTEX_LOCK(&inst->ri_lock);
	MUTEX_UNLOCK(&instance_list.ril_lock);

	if (inst != NULL) {
		while (inst->ri_method_thread != 0 &&
		    !pthread_equal(inst->ri_method_thread, pthread_self())) {
			++inst->ri_method_waiters;
			(void) pthread_cond_wait(&inst->ri_method_cv,
			    &inst->ri_lock);
			assert(inst->ri_method_waiters > 0);
			--inst->ri_method_waiters;
		}
	}

	return (inst);
}

static restarter_inst_t *
inst_lookup_queue(const char *name)
{
	int id;
	restarter_inst_t *inst;

	id = dict_lookup_byname(name);
	if (id == -1)
		return (NULL);

	MUTEX_LOCK(&instance_list.ril_lock);
	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
	if (inst != NULL)
		MUTEX_LOCK(&inst->ri_queue_lock);
	MUTEX_UNLOCK(&instance_list.ril_lock);

	return (inst);
}

const char *
service_style(int flags)
{
	switch (flags & RINST_STYLE_MASK) {
	case RINST_CONTRACT:	return ("contract");
	case RINST_TRANSIENT:	return ("transient");
	case RINST_WAIT:	return ("wait");

	default:
#ifndef NDEBUG
uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags); 227 #endif 228 abort(); 229 /* NOTREACHED */ 230 } 231 } 232 233 /* 234 * Fails with ECONNABORTED or ECANCELED. 235 */ 236 static int 237 check_contract(restarter_inst_t *inst, boolean_t primary, 238 scf_instance_t *scf_inst) 239 { 240 ctid_t *ctidp; 241 int fd, r; 242 243 ctidp = primary ? &inst->ri_i.i_primary_ctid : 244 &inst->ri_i.i_transient_ctid; 245 246 assert(*ctidp >= 1); 247 248 fd = contract_open(*ctidp, NULL, "status", O_RDONLY); 249 if (fd >= 0) { 250 r = close(fd); 251 assert(r == 0); 252 return (0); 253 } 254 255 r = restarter_remove_contract(scf_inst, *ctidp, primary ? 256 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT); 257 switch (r) { 258 case 0: 259 case ECONNABORTED: 260 case ECANCELED: 261 *ctidp = 0; 262 return (r); 263 264 case ENOMEM: 265 uu_die("Out of memory\n"); 266 /* NOTREACHED */ 267 268 case EPERM: 269 uu_die("Insufficient privilege.\n"); 270 /* NOTREACHED */ 271 272 case EACCES: 273 uu_die("Repository backend access denied.\n"); 274 /* NOTREACHED */ 275 276 case EROFS: 277 log_error(LOG_INFO, "Could not remove unusable contract id %ld " 278 "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri); 279 return (0); 280 281 case EINVAL: 282 case EBADF: 283 default: 284 assert(0); 285 abort(); 286 /* NOTREACHED */ 287 } 288 } 289 290 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t); 291 292 /* 293 * int restarter_insert_inst(scf_handle_t *, char *) 294 * If the inst is already in the restarter list, return its id. If the inst 295 * is not in the restarter list, initialize a restarter_inst_t, initialize its 296 * states, insert it into the list, and return 0. 297 * 298 * Fails with 299 * ENOENT - name is not in the repository 300 */ 301 static int 302 restarter_insert_inst(scf_handle_t *h, const char *name) 303 { 304 int id, r; 305 restarter_inst_t *inst; 306 uu_list_index_t idx; 307 scf_service_t *scf_svc; 308 scf_instance_t *scf_inst; 309 scf_snapshot_t *snap; 310 scf_propertygroup_t *pg; 311 char *svc_name, *inst_name; 312 char logfilebuf[PATH_MAX]; 313 char *c; 314 boolean_t do_commit_states; 315 restarter_instance_state_t state, next_state; 316 protocol_states_t *ps; 317 pid_t start_pid; 318 319 MUTEX_LOCK(&instance_list.ril_lock); 320 321 /* 322 * We don't use inst_lookup_by_name() here because we want the lookup 323 * & insert to be atomic. 324 */ 325 id = dict_lookup_byname(name); 326 if (id != -1) { 327 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, 328 &idx); 329 if (inst != NULL) { 330 MUTEX_UNLOCK(&instance_list.ril_lock); 331 return (0); 332 } 333 } 334 335 /* Allocate an instance */ 336 inst = startd_zalloc(sizeof (restarter_inst_t)); 337 inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size); 338 inst->ri_utmpx_prefix[0] = '\0'; 339 340 inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1); 341 (void) strcpy((char *)inst->ri_i.i_fmri, name); 342 343 inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0); 344 345 /* 346 * id shouldn't be -1 since we use the same dictionary as graph.c, but 347 * just in case. 348 */ 349 inst->ri_id = (id != -1 ? 
id : dict_insert(name)); 350 351 special_online_hooks_get(name, &inst->ri_pre_online_hook, 352 &inst->ri_post_online_hook, &inst->ri_post_offline_hook); 353 354 scf_svc = safe_scf_service_create(h); 355 scf_inst = safe_scf_instance_create(h); 356 pg = safe_scf_pg_create(h); 357 svc_name = startd_alloc(max_scf_name_size); 358 inst_name = startd_alloc(max_scf_name_size); 359 360 rep_retry: 361 if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL, 362 NULL, SCF_DECODE_FMRI_EXACT) != 0) { 363 switch (scf_error()) { 364 case SCF_ERROR_CONNECTION_BROKEN: 365 libscf_handle_rebind(h); 366 goto rep_retry; 367 368 case SCF_ERROR_NOT_FOUND: 369 deleted: 370 MUTEX_UNLOCK(&instance_list.ril_lock); 371 startd_free(inst_name, max_scf_name_size); 372 startd_free(svc_name, max_scf_name_size); 373 scf_pg_destroy(pg); 374 scf_instance_destroy(scf_inst); 375 scf_service_destroy(scf_svc); 376 startd_free((void *)inst->ri_i.i_fmri, 377 strlen(inst->ri_i.i_fmri) + 1); 378 startd_free(inst, sizeof (restarter_inst_t)); 379 return (ENOENT); 380 } 381 382 uu_die("Can't decode FMRI %s: %s\n", name, 383 scf_strerror(scf_error())); 384 } 385 386 /* 387 * If there's no running snapshot, then we execute using the editing 388 * snapshot. Pending snapshots will be taken later. 389 */ 390 snap = libscf_get_running_snapshot(scf_inst); 391 392 if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) || 393 (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) < 394 0)) { 395 switch (scf_error()) { 396 case SCF_ERROR_NOT_SET: 397 break; 398 399 case SCF_ERROR_CONNECTION_BROKEN: 400 libscf_handle_rebind(h); 401 goto rep_retry; 402 403 default: 404 assert(0); 405 abort(); 406 } 407 408 scf_snapshot_destroy(snap); 409 goto deleted; 410 } 411 412 (void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name); 413 for (c = logfilebuf; *c != '\0'; c++) 414 if (*c == '/') 415 *c = '-'; 416 417 inst->ri_logstem = startd_alloc(PATH_MAX); 418 (void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf, 419 LOG_SUFFIX); 420 421 /* 422 * If the restarter group is missing, use uninit/none. Otherwise, 423 * we're probably being restarted & don't want to mess up the states 424 * that are there. 425 */ 426 state = RESTARTER_STATE_UNINIT; 427 next_state = RESTARTER_STATE_NONE; 428 429 r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg); 430 if (r != 0) { 431 switch (scf_error()) { 432 case SCF_ERROR_CONNECTION_BROKEN: 433 libscf_handle_rebind(h); 434 goto rep_retry; 435 436 case SCF_ERROR_NOT_SET: 437 scf_snapshot_destroy(snap); 438 goto deleted; 439 440 case SCF_ERROR_NOT_FOUND: 441 /* 442 * This shouldn't happen since the graph engine should 443 * have initialized the state to uninitialized/none if 444 * there was no restarter pg. In case somebody 445 * deleted it, though.... 446 */ 447 do_commit_states = B_TRUE; 448 break; 449 450 default: 451 assert(0); 452 abort(); 453 } 454 } else { 455 r = libscf_read_states(pg, &state, &next_state); 456 if (r != 0) { 457 do_commit_states = B_TRUE; 458 } else { 459 if (next_state != RESTARTER_STATE_NONE) { 460 /* 461 * Force next_state to _NONE since we 462 * don't look for method processes. 463 */ 464 next_state = RESTARTER_STATE_NONE; 465 do_commit_states = B_TRUE; 466 } else { 467 /* 468 * Inform the restarter of our state without 469 * changing the STIME in the repository. 
470 */ 471 ps = startd_alloc(sizeof (*ps)); 472 inst->ri_i.i_state = ps->ps_state = state; 473 inst->ri_i.i_next_state = ps->ps_state_next = 474 next_state; 475 476 graph_protocol_send_event(inst->ri_i.i_fmri, 477 GRAPH_UPDATE_STATE_CHANGE, ps); 478 479 do_commit_states = B_FALSE; 480 } 481 } 482 } 483 484 switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags, 485 &inst->ri_utmpx_prefix)) { 486 case 0: 487 break; 488 489 case ECONNABORTED: 490 libscf_handle_rebind(h); 491 goto rep_retry; 492 493 case ECANCELED: 494 scf_snapshot_destroy(snap); 495 startd_free(inst->ri_utmpx_prefix, max_scf_value_size); 496 goto deleted; 497 498 case ENOENT: 499 /* 500 * This is odd, because the graph engine should have required 501 * the general property group. So we'll just use default 502 * flags in anticipation of the graph engine sending us 503 * REMOVE_INSTANCE when it finds out that the general property 504 * group has been deleted. 505 */ 506 inst->ri_flags = RINST_CONTRACT; 507 break; 508 509 default: 510 assert(0); 511 abort(); 512 } 513 514 switch (libscf_get_template_values(scf_inst, snap, 515 &inst->ri_common_name, &inst->ri_C_common_name)) { 516 case 0: 517 break; 518 519 case ECONNABORTED: 520 libscf_handle_rebind(h); 521 goto rep_retry; 522 523 case ECANCELED: 524 scf_snapshot_destroy(snap); 525 startd_free(inst->ri_common_name, max_scf_value_size); 526 inst->ri_common_name = NULL; 527 goto deleted; 528 529 case ECHILD: 530 case ENOENT: 531 break; 532 533 default: 534 assert(0); 535 abort(); 536 } 537 538 switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri, 539 &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid, 540 &start_pid)) { 541 case 0: 542 break; 543 544 case ECONNABORTED: 545 libscf_handle_rebind(h); 546 goto rep_retry; 547 548 case ECANCELED: 549 scf_snapshot_destroy(snap); 550 goto deleted; 551 552 default: 553 assert(0); 554 abort(); 555 } 556 557 if (inst->ri_i.i_primary_ctid >= 1) { 558 contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id); 559 560 switch (check_contract(inst, B_TRUE, scf_inst)) { 561 case 0: 562 break; 563 564 case ECONNABORTED: 565 libscf_handle_rebind(h); 566 goto rep_retry; 567 568 case ECANCELED: 569 scf_snapshot_destroy(snap); 570 goto deleted; 571 572 default: 573 assert(0); 574 abort(); 575 } 576 } 577 578 if (inst->ri_i.i_transient_ctid >= 1) { 579 switch (check_contract(inst, B_FALSE, scf_inst)) { 580 case 0: 581 break; 582 583 case ECONNABORTED: 584 libscf_handle_rebind(h); 585 goto rep_retry; 586 587 case ECANCELED: 588 scf_snapshot_destroy(snap); 589 goto deleted; 590 591 default: 592 assert(0); 593 abort(); 594 } 595 } 596 597 /* No more failures we live through, so add it to the list. */ 598 (void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs); 599 (void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs); 600 MUTEX_LOCK(&inst->ri_lock); 601 MUTEX_LOCK(&inst->ri_queue_lock); 602 603 (void) pthread_cond_init(&inst->ri_method_cv, NULL); 604 605 uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool); 606 uu_list_insert(instance_list.ril_instance_list, inst, idx); 607 MUTEX_UNLOCK(&instance_list.ril_lock); 608 609 if (start_pid != -1 && 610 (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) { 611 int ret; 612 ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1); 613 if (ret == -1) { 614 /* 615 * Implication: if we can't reregister the 616 * instance, we will start another one. Two 617 * instances may or may not result in a resource 618 * conflict. 
619 */ 620 log_error(LOG_WARNING, 621 "%s: couldn't reregister %ld for wait\n", 622 inst->ri_i.i_fmri, start_pid); 623 } else if (ret == 1) { 624 /* 625 * Leading PID has exited. 626 */ 627 (void) stop_instance(h, inst, RSTOP_EXIT); 628 } 629 } 630 631 632 scf_pg_destroy(pg); 633 634 if (do_commit_states) 635 (void) restarter_instance_update_states(h, inst, state, 636 next_state, RERR_NONE, NULL); 637 638 log_framework(LOG_DEBUG, "%s is a %s-style service\n", name, 639 service_style(inst->ri_flags)); 640 641 MUTEX_UNLOCK(&inst->ri_queue_lock); 642 MUTEX_UNLOCK(&inst->ri_lock); 643 644 startd_free(svc_name, max_scf_name_size); 645 startd_free(inst_name, max_scf_name_size); 646 scf_snapshot_destroy(snap); 647 scf_instance_destroy(scf_inst); 648 scf_service_destroy(scf_svc); 649 650 log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n", 651 name); 652 653 return (0); 654 } 655 656 static void 657 restarter_delete_inst(restarter_inst_t *ri) 658 { 659 int id; 660 restarter_inst_t *rip; 661 void *cookie = NULL; 662 restarter_instance_qentry_t *e; 663 664 assert(PTHREAD_MUTEX_HELD(&ri->ri_lock)); 665 666 /* 667 * Must drop the instance lock so we can pick up the instance_list 668 * lock & remove the instance. 669 */ 670 id = ri->ri_id; 671 MUTEX_UNLOCK(&ri->ri_lock); 672 673 MUTEX_LOCK(&instance_list.ril_lock); 674 675 rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL); 676 if (rip == NULL) { 677 MUTEX_UNLOCK(&instance_list.ril_lock); 678 return; 679 } 680 681 assert(ri == rip); 682 683 uu_list_remove(instance_list.ril_instance_list, ri); 684 685 log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n", 686 ri->ri_i.i_fmri); 687 688 MUTEX_UNLOCK(&instance_list.ril_lock); 689 690 /* 691 * We can lock the instance without holding the instance_list lock 692 * since we removed the instance from the list. 693 */ 694 MUTEX_LOCK(&ri->ri_lock); 695 MUTEX_LOCK(&ri->ri_queue_lock); 696 697 if (ri->ri_i.i_primary_ctid >= 1) 698 contract_hash_remove(ri->ri_i.i_primary_ctid); 699 700 while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0) 701 (void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock); 702 703 while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL) 704 startd_free(e, sizeof (*e)); 705 uu_list_destroy(ri->ri_queue); 706 707 startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1); 708 startd_free(ri->ri_logstem, PATH_MAX); 709 startd_free(ri->ri_utmpx_prefix, max_scf_value_size); 710 (void) pthread_mutex_destroy(&ri->ri_lock); 711 (void) pthread_mutex_destroy(&ri->ri_queue_lock); 712 startd_free(ri, sizeof (restarter_inst_t)); 713 } 714 715 /* 716 * instance_is_wait_style() 717 * 718 * Returns 1 if the given instance is a "wait-style" service instance. 719 */ 720 int 721 instance_is_wait_style(restarter_inst_t *inst) 722 { 723 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 724 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT); 725 } 726 727 /* 728 * instance_is_transient_style() 729 * 730 * Returns 1 if the given instance is a transient service instance. 
731 */ 732 int 733 instance_is_transient_style(restarter_inst_t *inst) 734 { 735 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 736 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT); 737 } 738 739 /* 740 * instance_in_transition() 741 * Returns 1 if instance is in transition, 0 if not 742 */ 743 int 744 instance_in_transition(restarter_inst_t *inst) 745 { 746 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 747 if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE) 748 return (0); 749 return (1); 750 } 751 752 /* 753 * Returns 754 * 0 - success 755 * ECONNRESET - success, but h was rebound 756 */ 757 int 758 restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri, 759 restarter_instance_state_t new_state, 760 restarter_instance_state_t new_state_next, restarter_error_t err, char *aux) 761 { 762 protocol_states_t *states; 763 int e; 764 uint_t retry_count = 0, msecs = ALLOC_DELAY; 765 boolean_t rebound = B_FALSE; 766 767 assert(PTHREAD_MUTEX_HELD(&ri->ri_lock)); 768 769 retry: 770 e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next, 771 aux); 772 switch (e) { 773 case 0: 774 break; 775 776 case ENOMEM: 777 ++retry_count; 778 if (retry_count < ALLOC_RETRY) { 779 (void) poll(NULL, 0, msecs); 780 msecs *= ALLOC_DELAY_MULT; 781 goto retry; 782 } 783 784 /* Like startd_alloc(). */ 785 uu_die("Insufficient memory.\n"); 786 /* NOTREACHED */ 787 788 case ECONNABORTED: 789 libscf_handle_rebind(h); 790 rebound = B_TRUE; 791 goto retry; 792 793 case EPERM: 794 case EACCES: 795 case EROFS: 796 log_error(LOG_NOTICE, "Could not commit state change for %s " 797 "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e)); 798 /* FALLTHROUGH */ 799 800 case ENOENT: 801 ri->ri_i.i_state = new_state; 802 ri->ri_i.i_next_state = new_state_next; 803 break; 804 805 case EINVAL: 806 default: 807 bad_error("_restarter_commit_states", e); 808 } 809 810 states = startd_alloc(sizeof (protocol_states_t)); 811 states->ps_state = new_state; 812 states->ps_state_next = new_state_next; 813 states->ps_err = err; 814 graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE, 815 (void *)states); 816 817 if (new_state == RESTARTER_STATE_ONLINE) 818 ri->ri_post_online_hook(); 819 820 return (rebound ? ECONNRESET : 0); 821 } 822 823 void 824 restarter_mark_pending_snapshot(const char *fmri, uint_t flag) 825 { 826 restarter_inst_t *inst; 827 828 assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START); 829 830 inst = inst_lookup_by_name(fmri); 831 if (inst == NULL) 832 return; 833 834 inst->ri_flags |= flag; 835 836 MUTEX_UNLOCK(&inst->ri_lock); 837 } 838 839 static void 840 restarter_take_pending_snapshots(scf_handle_t *h) 841 { 842 restarter_inst_t *inst; 843 int r; 844 845 MUTEX_LOCK(&instance_list.ril_lock); 846 847 for (inst = uu_list_first(instance_list.ril_instance_list); 848 inst != NULL; 849 inst = uu_list_next(instance_list.ril_instance_list, inst)) { 850 const char *fmri; 851 scf_instance_t *sinst = NULL; 852 853 MUTEX_LOCK(&inst->ri_lock); 854 855 /* 856 * This is where we'd check inst->ri_method_thread and if it 857 * were nonzero we'd wait in anticipation of another thread 858 * executing a method for inst. Doing so with the instance_list 859 * locked, though, leads to deadlock. Since taking a snapshot 860 * during that window won't hurt anything, we'll just continue. 
861 */ 862 863 fmri = inst->ri_i.i_fmri; 864 865 if (inst->ri_flags & RINST_RETAKE_RUNNING) { 866 scf_snapshot_t *rsnap; 867 868 (void) libscf_fmri_get_instance(h, fmri, &sinst); 869 870 rsnap = libscf_get_or_make_running_snapshot(sinst, 871 fmri, B_FALSE); 872 873 scf_instance_destroy(sinst); 874 875 if (rsnap != NULL) 876 inst->ri_flags &= ~RINST_RETAKE_RUNNING; 877 878 scf_snapshot_destroy(rsnap); 879 } 880 881 if (inst->ri_flags & RINST_RETAKE_START) { 882 switch (r = libscf_snapshots_poststart(h, fmri, 883 B_FALSE)) { 884 case 0: 885 case ENOENT: 886 inst->ri_flags &= ~RINST_RETAKE_START; 887 break; 888 889 case ECONNABORTED: 890 break; 891 892 case EACCES: 893 default: 894 bad_error("libscf_snapshots_poststart", r); 895 } 896 } 897 898 MUTEX_UNLOCK(&inst->ri_lock); 899 } 900 901 MUTEX_UNLOCK(&instance_list.ril_lock); 902 } 903 904 /* ARGSUSED */ 905 void * 906 restarter_post_fsminimal_thread(void *unused) 907 { 908 scf_handle_t *h; 909 int r; 910 911 h = libscf_handle_create_bound_loop(); 912 913 for (;;) { 914 r = libscf_create_self(h); 915 if (r == 0) 916 break; 917 918 assert(r == ECONNABORTED); 919 libscf_handle_rebind(h); 920 } 921 922 restarter_take_pending_snapshots(h); 923 924 (void) scf_handle_unbind(h); 925 scf_handle_destroy(h); 926 927 return (NULL); 928 } 929 930 /* 931 * returns 1 if instance is already started, 0 if not 932 */ 933 static int 934 instance_started(restarter_inst_t *inst) 935 { 936 int ret; 937 938 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 939 940 if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE || 941 inst->ri_i.i_state == RESTARTER_STATE_DEGRADED) 942 ret = 1; 943 else 944 ret = 0; 945 946 return (ret); 947 } 948 949 /* 950 * int stop_instance() 951 * 952 * Stop the instance identified by the instance given as the second argument, 953 * for the cause stated. 
954 * 955 * Returns 956 * 0 - success 957 * -1 - inst is in transition 958 */ 959 static int 960 stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, 961 stop_cause_t cause) 962 { 963 fork_info_t *info; 964 const char *cp; 965 int err; 966 restarter_error_t re; 967 968 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 969 assert(inst->ri_method_thread == 0); 970 971 switch (cause) { 972 case RSTOP_EXIT: 973 re = RERR_RESTART; 974 cp = "all processes in service exited"; 975 break; 976 case RSTOP_CORE: 977 re = RERR_FAULT; 978 cp = "process dumped core"; 979 break; 980 case RSTOP_SIGNAL: 981 re = RERR_FAULT; 982 cp = "process received fatal signal from outside the service"; 983 break; 984 case RSTOP_HWERR: 985 re = RERR_FAULT; 986 cp = "process killed due to uncorrectable hardware error"; 987 break; 988 case RSTOP_DEPENDENCY: 989 re = RERR_RESTART; 990 cp = "dependency activity requires stop"; 991 break; 992 case RSTOP_DISABLE: 993 re = RERR_RESTART; 994 cp = "service disabled"; 995 break; 996 case RSTOP_RESTART: 997 re = RERR_RESTART; 998 cp = "service restarting"; 999 break; 1000 default: 1001 #ifndef NDEBUG 1002 (void) fprintf(stderr, "Unknown cause %d at %s:%d.\n", 1003 cause, __FILE__, __LINE__); 1004 #endif 1005 abort(); 1006 } 1007 1008 /* Services in the disabled and maintenance state are ignored */ 1009 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT || 1010 inst->ri_i.i_state == RESTARTER_STATE_DISABLED) { 1011 log_framework(LOG_DEBUG, 1012 "%s: stop_instance -> is maint/disabled\n", 1013 inst->ri_i.i_fmri); 1014 return (0); 1015 } 1016 1017 /* Already stopped instances are left alone */ 1018 if (instance_started(inst) == 0) { 1019 log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n", 1020 inst->ri_i.i_fmri); 1021 return (0); 1022 } 1023 1024 if (instance_in_transition(inst)) { 1025 /* requeue event by returning -1 */ 1026 log_framework(LOG_DEBUG, 1027 "Restarter: Not stopping %s, in transition.\n", 1028 inst->ri_i.i_fmri); 1029 return (-1); 1030 } 1031 1032 log_instance(inst, B_TRUE, "Stopping because %s.", cp); 1033 1034 log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG, 1035 "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp); 1036 1037 if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) { 1038 /* 1039 * No need to stop instance, as child has exited; remove 1040 * contract and move the instance to the offline state. 1041 */ 1042 switch (err = restarter_instance_update_states(local_handle, 1043 inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re, 1044 NULL)) { 1045 case 0: 1046 case ECONNRESET: 1047 break; 1048 1049 default: 1050 bad_error("restarter_instance_update_states", err); 1051 } 1052 1053 (void) update_fault_count(inst, FAULT_COUNT_RESET); 1054 1055 if (inst->ri_i.i_primary_ctid != 0) { 1056 inst->ri_m_inst = 1057 safe_scf_instance_create(local_handle); 1058 inst->ri_mi_deleted = B_FALSE; 1059 1060 libscf_reget_instance(inst); 1061 method_remove_contract(inst, B_TRUE, B_TRUE); 1062 1063 scf_instance_destroy(inst->ri_m_inst); 1064 inst->ri_m_inst = NULL; 1065 } 1066 1067 switch (err = restarter_instance_update_states(local_handle, 1068 inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re, 1069 NULL)) { 1070 case 0: 1071 case ECONNRESET: 1072 break; 1073 1074 default: 1075 bad_error("restarter_instance_update_states", err); 1076 } 1077 1078 return (0); 1079 } 1080 1081 switch (err = restarter_instance_update_states(local_handle, inst, 1082 inst->ri_i.i_state, inst->ri_i.i_enabled ? 
RESTARTER_STATE_OFFLINE : 1083 RESTARTER_STATE_DISABLED, RERR_NONE, NULL)) { 1084 case 0: 1085 case ECONNRESET: 1086 break; 1087 1088 default: 1089 bad_error("restarter_instance_update_states", err); 1090 } 1091 1092 info = startd_zalloc(sizeof (fork_info_t)); 1093 1094 info->sf_id = inst->ri_id; 1095 info->sf_method_type = METHOD_STOP; 1096 info->sf_event_type = re; 1097 inst->ri_method_thread = startd_thread_create(method_thread, info); 1098 1099 return (0); 1100 } 1101 1102 /* 1103 * Returns 1104 * ENOENT - fmri is not in instance_list 1105 * 0 - success 1106 * ECONNRESET - success, though handle was rebound 1107 * -1 - instance is in transition 1108 */ 1109 int 1110 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags) 1111 { 1112 restarter_inst_t *rip; 1113 int r; 1114 1115 rip = inst_lookup_by_name(fmri); 1116 if (rip == NULL) 1117 return (ENOENT); 1118 1119 r = stop_instance(h, rip, flags); 1120 1121 MUTEX_UNLOCK(&rip->ri_lock); 1122 1123 return (r); 1124 } 1125 1126 static void 1127 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip, 1128 unmaint_cause_t cause) 1129 { 1130 ctid_t ctid; 1131 scf_instance_t *inst; 1132 int r; 1133 uint_t tries = 0, msecs = ALLOC_DELAY; 1134 const char *cp; 1135 1136 assert(PTHREAD_MUTEX_HELD(&rip->ri_lock)); 1137 1138 if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) { 1139 log_error(LOG_DEBUG, "Restarter: " 1140 "Ignoring maintenance off command because %s is not in the " 1141 "maintenance state.\n", rip->ri_i.i_fmri); 1142 return; 1143 } 1144 1145 switch (cause) { 1146 case RUNMAINT_CLEAR: 1147 cp = "clear requested"; 1148 break; 1149 case RUNMAINT_DISABLE: 1150 cp = "disable requested"; 1151 break; 1152 default: 1153 #ifndef NDEBUG 1154 (void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n", 1155 cause, __FILE__, __LINE__); 1156 #endif 1157 abort(); 1158 } 1159 1160 log_instance(rip, B_TRUE, "Leaving maintenance because %s.", 1161 cp); 1162 log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because " 1163 "%s.\n", rip->ri_i.i_fmri, cp); 1164 1165 (void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT, 1166 RESTARTER_STATE_NONE, RERR_RESTART, NULL); 1167 1168 /* 1169 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be 1170 * a primary contract. 1171 */ 1172 if (rip->ri_i.i_primary_ctid == 0) 1173 return; 1174 1175 ctid = rip->ri_i.i_primary_ctid; 1176 contract_abandon(ctid); 1177 rip->ri_i.i_primary_ctid = 0; 1178 1179 rep_retry: 1180 switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) { 1181 case 0: 1182 break; 1183 1184 case ECONNABORTED: 1185 libscf_handle_rebind(h); 1186 goto rep_retry; 1187 1188 case ENOENT: 1189 /* Must have been deleted. 
*/ 1190 return; 1191 1192 case EINVAL: 1193 case ENOTSUP: 1194 default: 1195 bad_error("libscf_handle_rebind", r); 1196 } 1197 1198 again: 1199 r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY); 1200 switch (r) { 1201 case 0: 1202 break; 1203 1204 case ENOMEM: 1205 ++tries; 1206 if (tries < ALLOC_RETRY) { 1207 (void) poll(NULL, 0, msecs); 1208 msecs *= ALLOC_DELAY_MULT; 1209 goto again; 1210 } 1211 1212 uu_die("Insufficient memory.\n"); 1213 /* NOTREACHED */ 1214 1215 case ECONNABORTED: 1216 scf_instance_destroy(inst); 1217 libscf_handle_rebind(h); 1218 goto rep_retry; 1219 1220 case ECANCELED: 1221 break; 1222 1223 case EPERM: 1224 case EACCES: 1225 case EROFS: 1226 log_error(LOG_INFO, 1227 "Could not remove contract id %lu for %s (%s).\n", ctid, 1228 rip->ri_i.i_fmri, strerror(r)); 1229 break; 1230 1231 case EINVAL: 1232 case EBADF: 1233 default: 1234 bad_error("restarter_remove_contract", r); 1235 } 1236 1237 scf_instance_destroy(inst); 1238 } 1239 1240 /* 1241 * enable_inst() 1242 * Set inst->ri_i.i_enabled. Expects 'e' to be _ENABLE, _DISABLE, or 1243 * _ADMIN_DISABLE. If the event is _ENABLE and inst is uninitialized or 1244 * disabled, move it to offline. If the event is _DISABLE or 1245 * _ADMIN_DISABLE, make sure inst will move to disabled. 1246 * 1247 * Returns 1248 * 0 - success 1249 * ECONNRESET - h was rebound 1250 */ 1251 static int 1252 enable_inst(scf_handle_t *h, restarter_inst_t *inst, restarter_event_type_t e) 1253 { 1254 restarter_instance_state_t state; 1255 int r; 1256 1257 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 1258 assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE || 1259 e == RESTARTER_EVENT_TYPE_DISABLE || 1260 e == RESTARTER_EVENT_TYPE_ENABLE); 1261 assert(instance_in_transition(inst) == 0); 1262 1263 state = inst->ri_i.i_state; 1264 1265 if (e == RESTARTER_EVENT_TYPE_ENABLE) { 1266 inst->ri_i.i_enabled = 1; 1267 1268 if (state == RESTARTER_STATE_UNINIT || 1269 state == RESTARTER_STATE_DISABLED) { 1270 /* 1271 * B_FALSE: Don't log an error if the log_instance() 1272 * fails because it will fail on the miniroot before 1273 * install-discovery runs. 1274 */ 1275 log_instance(inst, B_FALSE, "Enabled."); 1276 log_framework(LOG_DEBUG, "%s: Instance enabled.\n", 1277 inst->ri_i.i_fmri); 1278 (void) restarter_instance_update_states(h, inst, 1279 RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE, 1280 RERR_NONE, NULL); 1281 } else { 1282 log_framework(LOG_DEBUG, "Restarter: " 1283 "Not changing state of %s for enable command.\n", 1284 inst->ri_i.i_fmri); 1285 } 1286 } else { 1287 inst->ri_i.i_enabled = 0; 1288 1289 switch (state) { 1290 case RESTARTER_STATE_ONLINE: 1291 case RESTARTER_STATE_DEGRADED: 1292 r = stop_instance(h, inst, RSTOP_DISABLE); 1293 return (r == ECONNRESET ? 
0 : r);

		case RESTARTER_STATE_OFFLINE:
		case RESTARTER_STATE_UNINIT:
			if (inst->ri_i.i_primary_ctid != 0) {
				inst->ri_m_inst = safe_scf_instance_create(h);
				inst->ri_mi_deleted = B_FALSE;

				libscf_reget_instance(inst);
				method_remove_contract(inst, B_TRUE, B_TRUE);

				scf_instance_destroy(inst->ri_m_inst);
			}
			/* B_FALSE: See log_instance(..., "Enabled."); above */
			log_instance(inst, B_FALSE, "Disabled.");
			log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
			    inst->ri_i.i_fmri);
			(void) restarter_instance_update_states(h, inst,
			    RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
			    RERR_RESTART, NULL);
			return (0);

		case RESTARTER_STATE_DISABLED:
			break;

		case RESTARTER_STATE_MAINT:
			/*
			 * We only want to pull the instance out of maintenance
			 * if the disable is on administrative request.  The
			 * graph engine sends _DISABLE events whenever a
			 * service isn't in the disabled state, and we don't
			 * want to pull the service out of maintenance if,
			 * for example, it is there due to a dependency cycle.
			 */
			if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
				unmaintain_instance(h, inst, RUNMAINT_DISABLE);
			break;

		default:
#ifndef NDEBUG
			(void) fprintf(stderr, "Restarter instance %s has "
			    "unknown state %d.\n", inst->ri_i.i_fmri, state);
#endif
			abort();
		}
	}

	return (0);
}

static void
start_instance(scf_handle_t *local_handle, restarter_inst_t *inst)
{
	fork_info_t *info;

	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
	assert(instance_in_transition(inst) == 0);
	assert(inst->ri_method_thread == 0);

	log_framework(LOG_DEBUG, "%s: trying to start instance\n",
	    inst->ri_i.i_fmri);

	/* Services in the disabled and maintenance state are ignored */
	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
	    inst->ri_i.i_enabled == 0) {
		log_framework(LOG_DEBUG,
		    "%s: start_instance -> is maint/disabled\n",
		    inst->ri_i.i_fmri);
		return;
	}

	/* Already started instances are left alone */
	if (instance_started(inst) == 1) {
		log_framework(LOG_DEBUG,
		    "%s: start_instance -> is already started\n",
		    inst->ri_i.i_fmri);
		return;
	}

	log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);

	(void) restarter_instance_update_states(local_handle, inst,
	    inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, NULL);

	info = startd_zalloc(sizeof (fork_info_t));

	info->sf_id = inst->ri_id;
	info->sf_method_type = METHOD_START;
	info->sf_event_type = RERR_NONE;
	inst->ri_method_thread = startd_thread_create(method_thread, info);
}

static void
maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
    const char *aux)
{
	fork_info_t *info;

	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
	assert(aux != NULL);
	assert(rip->ri_method_thread == 0);

	log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.", aux);
	log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
	    rip->ri_i.i_fmri, aux);

	/* Services in the maintenance state are ignored */
	if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
		log_framework(LOG_DEBUG,
		    "%s: maintain_instance -> is already in maintenance\n",
rip->ri_i.i_fmri); 1405 return; 1406 } 1407 1408 if (immediate || !instance_started(rip)) { 1409 if (rip->ri_i.i_primary_ctid != 0) { 1410 rip->ri_m_inst = safe_scf_instance_create(h); 1411 rip->ri_mi_deleted = B_FALSE; 1412 1413 libscf_reget_instance(rip); 1414 method_remove_contract(rip, B_TRUE, B_TRUE); 1415 1416 scf_instance_destroy(rip->ri_m_inst); 1417 } 1418 1419 (void) restarter_instance_update_states(h, rip, 1420 RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART, 1421 (char *)aux); 1422 return; 1423 } 1424 1425 (void) restarter_instance_update_states(h, rip, rip->ri_i.i_state, 1426 RESTARTER_STATE_MAINT, RERR_NONE, (char *)aux); 1427 1428 info = startd_zalloc(sizeof (*info)); 1429 info->sf_id = rip->ri_id; 1430 info->sf_method_type = METHOD_STOP; 1431 info->sf_event_type = RERR_RESTART; 1432 rip->ri_method_thread = startd_thread_create(method_thread, info); 1433 } 1434 1435 static void 1436 refresh_instance(scf_handle_t *h, restarter_inst_t *rip) 1437 { 1438 scf_instance_t *inst; 1439 scf_snapshot_t *snap; 1440 fork_info_t *info; 1441 int r; 1442 1443 assert(PTHREAD_MUTEX_HELD(&rip->ri_lock)); 1444 1445 log_instance(rip, B_TRUE, "Rereading configuration."); 1446 log_framework(LOG_DEBUG, "%s: rereading configuration.\n", 1447 rip->ri_i.i_fmri); 1448 1449 rep_retry: 1450 r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst); 1451 switch (r) { 1452 case 0: 1453 break; 1454 1455 case ECONNABORTED: 1456 libscf_handle_rebind(h); 1457 goto rep_retry; 1458 1459 case ENOENT: 1460 /* Must have been deleted. */ 1461 return; 1462 1463 case EINVAL: 1464 case ENOTSUP: 1465 default: 1466 bad_error("libscf_fmri_get_instance", r); 1467 } 1468 1469 snap = libscf_get_running_snapshot(inst); 1470 1471 r = libscf_get_startd_properties(inst, snap, &rip->ri_flags, 1472 &rip->ri_utmpx_prefix); 1473 switch (r) { 1474 case 0: 1475 log_framework(LOG_DEBUG, "%s is a %s-style service\n", 1476 rip->ri_i.i_fmri, service_style(rip->ri_flags)); 1477 break; 1478 1479 case ECONNABORTED: 1480 scf_instance_destroy(inst); 1481 scf_snapshot_destroy(snap); 1482 libscf_handle_rebind(h); 1483 goto rep_retry; 1484 1485 case ECANCELED: 1486 case ENOENT: 1487 /* Succeed in anticipation of REMOVE_INSTANCE. */ 1488 break; 1489 1490 default: 1491 bad_error("libscf_get_startd_properties", r); 1492 } 1493 1494 if (instance_started(rip)) { 1495 /* Refresh does not change the state. */ 1496 (void) restarter_instance_update_states(h, rip, 1497 rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE, NULL); 1498 1499 info = startd_zalloc(sizeof (*info)); 1500 info->sf_id = rip->ri_id; 1501 info->sf_method_type = METHOD_REFRESH; 1502 info->sf_event_type = RERR_REFRESH; 1503 1504 assert(rip->ri_method_thread == 0); 1505 rip->ri_method_thread = 1506 startd_thread_create(method_thread, info); 1507 } 1508 1509 scf_snapshot_destroy(snap); 1510 scf_instance_destroy(inst); 1511 } 1512 1513 const char *event_names[] = { "INVALID", "ADD_INSTANCE", "REMOVE_INSTANCE", 1514 "ENABLE", "DISABLE", "ADMIN_DEGRADED", "ADMIN_REFRESH", 1515 "ADMIN_RESTART", "ADMIN_MAINT_OFF", "ADMIN_MAINT_ON", 1516 "ADMIN_MAINT_ON_IMMEDIATE", "STOP", "START", "DEPENDENCY_CYCLE", 1517 "INVALID_DEPENDENCY", "ADMIN_DISABLE" 1518 }; 1519 1520 /* 1521 * void *restarter_process_events() 1522 * 1523 * Called in a separate thread to process the events on an instance's 1524 * queue. Empties the queue completely, and tries to keep the thread 1525 * around for a little while after the queue is empty to save on 1526 * startup costs. 
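 *
 * In rough outline (an illustrative summary of the body below, not a
 * contract): grab the queue lock via inst_lookup_queue(), then for each
 * queued event drop the queue lock, take ri_lock via inst_lookup_by_name()
 * (which waits out any running method), act on the event, and re-take the
 * queue lock to delete the entry.  When the queue is empty, wait a few
 * seconds on ri_queue_cv for more work before clearing ri_queue_thread and
 * exiting.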
1527 */ 1528 static void * 1529 restarter_process_events(void *arg) 1530 { 1531 scf_handle_t *h; 1532 restarter_instance_qentry_t *event; 1533 restarter_inst_t *rip; 1534 char *fmri = (char *)arg; 1535 struct timespec to; 1536 1537 assert(fmri != NULL); 1538 1539 h = libscf_handle_create_bound_loop(); 1540 1541 /* grab the queue lock */ 1542 rip = inst_lookup_queue(fmri); 1543 if (rip == NULL) 1544 goto out; 1545 1546 again: 1547 1548 while ((event = uu_list_first(rip->ri_queue)) != NULL) { 1549 restarter_inst_t *inst; 1550 1551 /* drop the queue lock */ 1552 MUTEX_UNLOCK(&rip->ri_queue_lock); 1553 1554 /* 1555 * Grab the inst lock -- this waits until any outstanding 1556 * method finishes running. 1557 */ 1558 inst = inst_lookup_by_name(fmri); 1559 if (inst == NULL) { 1560 /* Getting deleted in the middle isn't an error. */ 1561 goto cont; 1562 } 1563 1564 assert(instance_in_transition(inst) == 0); 1565 1566 /* process the event */ 1567 switch (event->riq_type) { 1568 case RESTARTER_EVENT_TYPE_ENABLE: 1569 case RESTARTER_EVENT_TYPE_DISABLE: 1570 case RESTARTER_EVENT_TYPE_ADMIN_DISABLE: 1571 (void) enable_inst(h, inst, event->riq_type); 1572 break; 1573 1574 case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE: 1575 restarter_delete_inst(inst); 1576 inst = NULL; 1577 goto cont; 1578 1579 case RESTARTER_EVENT_TYPE_STOP: 1580 (void) stop_instance(h, inst, RSTOP_DEPENDENCY); 1581 break; 1582 1583 case RESTARTER_EVENT_TYPE_START: 1584 start_instance(h, inst); 1585 break; 1586 1587 case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE: 1588 maintain_instance(h, inst, 0, "dependency_cycle"); 1589 break; 1590 1591 case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY: 1592 maintain_instance(h, inst, 0, "invalid_dependency"); 1593 break; 1594 1595 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON: 1596 maintain_instance(h, inst, 0, "administrative_request"); 1597 break; 1598 1599 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE: 1600 maintain_instance(h, inst, 1, "administrative_request"); 1601 break; 1602 1603 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF: 1604 unmaintain_instance(h, inst, RUNMAINT_CLEAR); 1605 break; 1606 1607 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH: 1608 refresh_instance(h, inst); 1609 break; 1610 1611 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED: 1612 log_framework(LOG_WARNING, "Restarter: " 1613 "%s command (for %s) unimplemented.\n", 1614 event_names[event->riq_type], inst->ri_i.i_fmri); 1615 break; 1616 1617 case RESTARTER_EVENT_TYPE_ADMIN_RESTART: 1618 if (!instance_started(inst)) { 1619 log_framework(LOG_DEBUG, "Restarter: " 1620 "Not restarting %s; not running.\n", 1621 inst->ri_i.i_fmri); 1622 } else { 1623 /* 1624 * Stop the instance. If it can be restarted, 1625 * the graph engine will send a new event. 1626 */ 1627 (void) stop_instance(h, inst, RSTOP_RESTART); 1628 } 1629 break; 1630 1631 case RESTARTER_EVENT_TYPE_ADD_INSTANCE: 1632 default: 1633 #ifndef NDEBUG 1634 uu_warn("%s:%d: Bad restarter event %d. " 1635 "Aborting.\n", __FILE__, __LINE__, event->riq_type); 1636 #endif 1637 abort(); 1638 } 1639 1640 assert(inst != NULL); 1641 MUTEX_UNLOCK(&inst->ri_lock); 1642 1643 cont: 1644 /* grab the queue lock */ 1645 rip = inst_lookup_queue(fmri); 1646 if (rip == NULL) 1647 goto out; 1648 1649 /* delete the event */ 1650 uu_list_remove(rip->ri_queue, event); 1651 startd_free(event, sizeof (restarter_instance_qentry_t)); 1652 } 1653 1654 assert(rip != NULL); 1655 1656 /* 1657 * Try to preserve the thread for a little while for future use. 
	 */
	to.tv_sec = 3;
	to.tv_nsec = 0;
	(void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
	    &rip->ri_queue_lock, &to);

	if (uu_list_first(rip->ri_queue) != NULL)
		goto again;

	rip->ri_queue_thread = 0;
	MUTEX_UNLOCK(&rip->ri_queue_lock);
out:
	(void) scf_handle_unbind(h);
	scf_handle_destroy(h);
	free(fmri);
	return (NULL);
}

static int
is_admin_event(restarter_event_type_t t) {

	switch (t) {
	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
	case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
	case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
	case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
		return (1);
	default:
		return (0);
	}
}

static void
restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
{
	restarter_instance_qentry_t *qe;
	int r;

	assert(PTHREAD_MUTEX_HELD(&ri->ri_queue_lock));
	assert(!PTHREAD_MUTEX_HELD(&ri->ri_lock));

	qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
	qe->riq_type = e->rpe_type;

	uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
	r = uu_list_insert_before(ri->ri_queue, NULL, qe);
	assert(r == 0);
}

/*
 * void *restarter_event_thread()
 *
 *  Handle incoming graph events by placing them on a per-instance
 *  queue.  We can't lock the main part of the instance structure, so
 *  just modify the separately locked event queue portion.
 */
/*ARGSUSED*/
static void *
restarter_event_thread(void *unused)
{
	scf_handle_t *h;

	/*
	 * This is a new thread, and thus, gets its own handle
	 * to the repository.
	 */
	h = libscf_handle_create_bound_loop();

	MUTEX_LOCK(&ru->restarter_update_lock);

	/*CONSTCOND*/
	while (1) {
		restarter_protocol_event_t *e;

		while (ru->restarter_update_wakeup == 0)
			(void) pthread_cond_wait(&ru->restarter_update_cv,
			    &ru->restarter_update_lock);

		ru->restarter_update_wakeup = 0;

		while ((e = restarter_event_dequeue()) != NULL) {
			restarter_inst_t *rip;
			char *fmri;

			MUTEX_UNLOCK(&ru->restarter_update_lock);

			/*
			 * ADD_INSTANCE is special: there's likely no
			 * instance structure yet, so we need to handle the
			 * addition synchronously.
			 */
			switch (e->rpe_type) {
			case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
				if (restarter_insert_inst(h, e->rpe_inst) != 0)
					log_error(LOG_INFO, "Restarter: "
					    "Could not add %s.\n", e->rpe_inst);

				MUTEX_LOCK(&st->st_load_lock);
				if (--st->st_load_instances == 0)
					(void) pthread_cond_broadcast(
					    &st->st_load_cv);
				MUTEX_UNLOCK(&st->st_load_lock);

				goto nolookup;
			}

			/*
			 * Lookup the instance, locking only the event queue.
			 * Can't grab ri_lock here because it might be held
			 * by a long-running method.
			 */
			rip = inst_lookup_queue(e->rpe_inst);
			if (rip == NULL) {
				log_error(LOG_INFO, "Restarter: "
				    "Ignoring %s command for unknown service "
				    "%s.\n", event_names[e->rpe_type],
				    e->rpe_inst);
				goto nolookup;
			}

			/* Keep ADMIN events from filling up the queue.
*/ 1781 if (is_admin_event(e->rpe_type) && 1782 uu_list_numnodes(rip->ri_queue) > 1783 RINST_QUEUE_THRESHOLD) { 1784 MUTEX_UNLOCK(&rip->ri_queue_lock); 1785 log_instance(rip, B_TRUE, "Instance event " 1786 "queue overflow. Dropping administrative " 1787 "request."); 1788 log_framework(LOG_DEBUG, "%s: Instance event " 1789 "queue overflow. Dropping administrative " 1790 "request.\n", rip->ri_i.i_fmri); 1791 goto nolookup; 1792 } 1793 1794 /* Now add the event to the instance queue. */ 1795 restarter_queue_event(rip, e); 1796 1797 if (rip->ri_queue_thread == 0) { 1798 /* 1799 * Start a thread if one isn't already 1800 * running. 1801 */ 1802 fmri = safe_strdup(e->rpe_inst); 1803 rip->ri_queue_thread = startd_thread_create( 1804 restarter_process_events, (void *)fmri); 1805 } else { 1806 /* 1807 * Signal the existing thread that there's 1808 * a new event. 1809 */ 1810 (void) pthread_cond_broadcast( 1811 &rip->ri_queue_cv); 1812 } 1813 1814 MUTEX_UNLOCK(&rip->ri_queue_lock); 1815 nolookup: 1816 restarter_event_release(e); 1817 1818 MUTEX_LOCK(&ru->restarter_update_lock); 1819 } 1820 } 1821 1822 /* 1823 * Unreachable for now -- there's currently no graceful cleanup 1824 * called on exit(). 1825 */ 1826 (void) scf_handle_unbind(h); 1827 scf_handle_destroy(h); 1828 return (NULL); 1829 } 1830 1831 static restarter_inst_t * 1832 contract_to_inst(ctid_t ctid) 1833 { 1834 restarter_inst_t *inst; 1835 int id; 1836 1837 id = lookup_inst_by_contract(ctid); 1838 if (id == -1) 1839 return (NULL); 1840 1841 inst = inst_lookup_by_id(id); 1842 if (inst != NULL) { 1843 /* 1844 * Since ri_lock isn't held by the contract id lookup, this 1845 * instance may have been restarted and now be in a new 1846 * contract, making the old contract no longer valid for this 1847 * instance. 1848 */ 1849 if (ctid != inst->ri_i.i_primary_ctid) { 1850 MUTEX_UNLOCK(&inst->ri_lock); 1851 inst = NULL; 1852 } 1853 } 1854 return (inst); 1855 } 1856 1857 /* 1858 * void contract_action() 1859 * Take action on contract events. 1860 */ 1861 static void 1862 contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id, 1863 uint32_t type) 1864 { 1865 const char *fmri = inst->ri_i.i_fmri; 1866 1867 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 1868 1869 /* 1870 * If startd has stopped this contract, there is no need to 1871 * stop it again. 1872 */ 1873 if (inst->ri_i.i_primary_ctid > 0 && 1874 inst->ri_i.i_primary_ctid_stopped) 1875 return; 1876 1877 if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL 1878 | CT_PR_EV_HWERR)) == 0) { 1879 /* 1880 * There shouldn't be other events, since that's not how we set 1881 * the terms. Thus, just log an error and drive on. 1882 */ 1883 log_framework(LOG_NOTICE, 1884 "%s: contract %ld received unexpected critical event " 1885 "(%d)\n", fmri, id, type); 1886 return; 1887 } 1888 1889 assert(instance_in_transition(inst) == 0); 1890 1891 if (instance_is_wait_style(inst)) { 1892 /* 1893 * We ignore all events; if they impact the 1894 * process we're monitoring, then the 1895 * wait_thread will stop the instance. 1896 */ 1897 log_framework(LOG_DEBUG, 1898 "%s: ignoring contract event on wait-style service\n", 1899 fmri); 1900 } else { 1901 /* 1902 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request. 
1903 */ 1904 switch (type) { 1905 case CT_PR_EV_EMPTY: 1906 (void) stop_instance(h, inst, RSTOP_EXIT); 1907 break; 1908 case CT_PR_EV_CORE: 1909 (void) stop_instance(h, inst, RSTOP_CORE); 1910 break; 1911 case CT_PR_EV_SIGNAL: 1912 (void) stop_instance(h, inst, RSTOP_SIGNAL); 1913 break; 1914 case CT_PR_EV_HWERR: 1915 (void) stop_instance(h, inst, RSTOP_HWERR); 1916 break; 1917 } 1918 } 1919 } 1920 1921 /* 1922 * void *restarter_contract_event_thread(void *) 1923 * Listens to the process contract bundle for critical events, taking action 1924 * on events from contracts we know we are responsible for. 1925 */ 1926 /*ARGSUSED*/ 1927 static void * 1928 restarter_contracts_event_thread(void *unused) 1929 { 1930 int fd, err; 1931 scf_handle_t *local_handle; 1932 1933 /* 1934 * Await graph load completion. That is, stop here, until we've scanned 1935 * the repository for contract - instance associations. 1936 */ 1937 MUTEX_LOCK(&st->st_load_lock); 1938 while (!(st->st_load_complete && st->st_load_instances == 0)) 1939 (void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock); 1940 MUTEX_UNLOCK(&st->st_load_lock); 1941 1942 /* 1943 * This is a new thread, and thus, gets its own handle 1944 * to the repository. 1945 */ 1946 if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL) 1947 uu_die("Unable to bind a new repository handle: %s\n", 1948 scf_strerror(scf_error())); 1949 1950 fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY); 1951 if (fd == -1) 1952 uu_die("process bundle open failed"); 1953 1954 /* 1955 * Make sure we get all events (including those generated by configd 1956 * before this thread was started). 1957 */ 1958 err = ct_event_reset(fd); 1959 assert(err == 0); 1960 1961 for (;;) { 1962 int efd, sfd; 1963 ct_evthdl_t ev; 1964 uint32_t type; 1965 ctevid_t evid; 1966 ct_stathdl_t status; 1967 ctid_t ctid; 1968 restarter_inst_t *inst; 1969 uint64_t cookie; 1970 1971 if (err = ct_event_read_critical(fd, &ev)) { 1972 log_error(LOG_WARNING, 1973 "Error reading next contract event: %s", 1974 strerror(err)); 1975 continue; 1976 } 1977 1978 evid = ct_event_get_evid(ev); 1979 ctid = ct_event_get_ctid(ev); 1980 type = ct_event_get_type(ev); 1981 1982 /* Fetch cookie. */ 1983 if ((sfd = contract_open(ctid, "process", "status", O_RDONLY)) 1984 < 0) { 1985 ct_event_free(ev); 1986 continue; 1987 } 1988 1989 if (err = ct_status_read(sfd, CTD_COMMON, &status)) { 1990 log_framework(LOG_WARNING, "Could not get status for " 1991 "contract %ld: %s\n", ctid, strerror(err)); 1992 1993 startd_close(sfd); 1994 ct_event_free(ev); 1995 continue; 1996 } 1997 1998 cookie = ct_status_get_cookie(status); 1999 2000 ct_status_free(status); 2001 2002 startd_close(sfd); 2003 2004 /* 2005 * svc.configd(1M) restart handling performed by the 2006 * fork_configd_thread. We don't acknowledge, as that thread 2007 * will do so. 2008 */ 2009 if (cookie == CONFIGD_COOKIE) { 2010 ct_event_free(ev); 2011 continue; 2012 } 2013 2014 inst = contract_to_inst(ctid); 2015 if (inst == NULL) { 2016 /* 2017 * This can happen if we receive an EMPTY 2018 * event for an abandoned contract. 
2019 */ 2020 log_framework(LOG_DEBUG, 2021 "Received event %d for unknown contract id " 2022 "%ld\n", type, ctid); 2023 } else { 2024 log_framework(LOG_DEBUG, 2025 "Received event %d for contract id " 2026 "%ld (%s)\n", type, ctid, 2027 inst->ri_i.i_fmri); 2028 2029 contract_action(local_handle, inst, ctid, type); 2030 2031 MUTEX_UNLOCK(&inst->ri_lock); 2032 } 2033 2034 efd = contract_open(ct_event_get_ctid(ev), "process", "ctl", 2035 O_WRONLY); 2036 if (efd != -1) { 2037 (void) ct_ctl_ack(efd, evid); 2038 startd_close(efd); 2039 } 2040 2041 ct_event_free(ev); 2042 2043 } 2044 2045 /*NOTREACHED*/ 2046 return (NULL); 2047 } 2048 2049 /* 2050 * Timeout queue, processed by restarter_timeouts_event_thread(). 2051 */ 2052 timeout_queue_t *timeouts; 2053 static uu_list_pool_t *timeout_pool; 2054 2055 typedef struct timeout_update { 2056 pthread_mutex_t tu_lock; 2057 pthread_cond_t tu_cv; 2058 int tu_wakeup; 2059 } timeout_update_t; 2060 2061 timeout_update_t *tu; 2062 2063 static const char *timeout_ovr_svcs[] = { 2064 "svc:/system/manifest-import:default", 2065 "svc:/network/initial:default", 2066 "svc:/network/service:default", 2067 "svc:/system/rmtmpfiles:default", 2068 "svc:/network/loopback:default", 2069 "svc:/network/physical:default", 2070 "svc:/system/device/local:default", 2071 "svc:/system/metainit:default", 2072 "svc:/system/filesystem/usr:default", 2073 "svc:/system/filesystem/minimal:default", 2074 "svc:/system/filesystem/local:default", 2075 NULL 2076 }; 2077 2078 int 2079 is_timeout_ovr(restarter_inst_t *inst) 2080 { 2081 int i; 2082 2083 for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) { 2084 if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) { 2085 log_instance(inst, B_TRUE, "Timeout override by " 2086 "svc.startd. Using infinite timeout"); 2087 return (1); 2088 } 2089 } 2090 2091 return (0); 2092 } 2093 2094 /*ARGSUSED*/ 2095 static int 2096 timeout_compare(const void *lc_arg, const void *rc_arg, void *private) 2097 { 2098 hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout; 2099 hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout; 2100 2101 if (t1 > t2) 2102 return (1); 2103 else if (t1 < t2) 2104 return (-1); 2105 return (0); 2106 } 2107 2108 void 2109 timeout_init() 2110 { 2111 timeouts = startd_zalloc(sizeof (timeout_queue_t)); 2112 2113 (void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs); 2114 2115 timeout_pool = startd_list_pool_create("timeouts", 2116 sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link), 2117 timeout_compare, UU_LIST_POOL_DEBUG); 2118 assert(timeout_pool != NULL); 2119 2120 timeouts->tq_list = startd_list_create(timeout_pool, 2121 timeouts, UU_LIST_SORTED); 2122 assert(timeouts->tq_list != NULL); 2123 2124 tu = startd_zalloc(sizeof (timeout_update_t)); 2125 (void) pthread_cond_init(&tu->tu_cv, NULL); 2126 (void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs); 2127 } 2128 2129 void 2130 timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec) 2131 { 2132 hrtime_t now, timeout; 2133 timeout_entry_t *entry; 2134 uu_list_index_t idx; 2135 2136 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 2137 2138 now = gethrtime(); 2139 2140 /* 2141 * If we overflow LLONG_MAX, we're never timing out anyways, so 2142 * just return. 2143 */ 2144 if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) { 2145 log_instance(inst, B_TRUE, "timeout_seconds too large, " 2146 "treating as infinite."); 2147 return; 2148 } 2149 2150 /* hrtime is in nanoseconds. Convert timeout_sec. 
*/ 2151 timeout = now + (timeout_sec * 1000000000LL); 2152 2153 entry = startd_alloc(sizeof (timeout_entry_t)); 2154 entry->te_timeout = timeout; 2155 entry->te_ctid = cid; 2156 entry->te_fmri = safe_strdup(inst->ri_i.i_fmri); 2157 entry->te_logstem = safe_strdup(inst->ri_logstem); 2158 entry->te_fired = 0; 2159 /* Insert the calculated timeout time onto the queue. */ 2160 MUTEX_LOCK(&timeouts->tq_lock); 2161 (void) uu_list_find(timeouts->tq_list, entry, NULL, &idx); 2162 uu_list_node_init(entry, &entry->te_link, timeout_pool); 2163 uu_list_insert(timeouts->tq_list, entry, idx); 2164 MUTEX_UNLOCK(&timeouts->tq_lock); 2165 2166 assert(inst->ri_timeout == NULL); 2167 inst->ri_timeout = entry; 2168 2169 MUTEX_LOCK(&tu->tu_lock); 2170 tu->tu_wakeup = 1; 2171 (void) pthread_cond_broadcast(&tu->tu_cv); 2172 MUTEX_UNLOCK(&tu->tu_lock); 2173 } 2174 2175 2176 void 2177 timeout_remove(restarter_inst_t *inst, ctid_t cid) 2178 { 2179 assert(PTHREAD_MUTEX_HELD(&inst->ri_lock)); 2180 2181 if (inst->ri_timeout == NULL) 2182 return; 2183 2184 assert(inst->ri_timeout->te_ctid == cid); 2185 2186 MUTEX_LOCK(&timeouts->tq_lock); 2187 uu_list_remove(timeouts->tq_list, inst->ri_timeout); 2188 MUTEX_UNLOCK(&timeouts->tq_lock); 2189 2190 free(inst->ri_timeout->te_fmri); 2191 free(inst->ri_timeout->te_logstem); 2192 startd_free(inst->ri_timeout, sizeof (timeout_entry_t)); 2193 inst->ri_timeout = NULL; 2194 } 2195 2196 static int 2197 timeout_now() 2198 { 2199 timeout_entry_t *e; 2200 hrtime_t now; 2201 int ret; 2202 2203 now = gethrtime(); 2204 2205 /* 2206 * Walk through the (sorted) timeouts list. While the timeout 2207 * at the head of the list is <= the current time, kill the 2208 * method. 2209 */ 2210 MUTEX_LOCK(&timeouts->tq_lock); 2211 2212 for (e = uu_list_first(timeouts->tq_list); 2213 e != NULL && e->te_timeout <= now; 2214 e = uu_list_next(timeouts->tq_list, e)) { 2215 log_framework(LOG_WARNING, "%s: Method or service exit timed " 2216 "out. Killing contract %ld.\n", e->te_fmri, e->te_ctid); 2217 log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE, 2218 "Method or service exit timed out. Killing contract %ld", 2219 e->te_ctid); 2220 e->te_fired = 1; 2221 (void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri); 2222 } 2223 2224 if (uu_list_numnodes(timeouts->tq_list) > 0) 2225 ret = 0; 2226 else 2227 ret = -1; 2228 2229 MUTEX_UNLOCK(&timeouts->tq_lock); 2230 2231 return (ret); 2232 } 2233 2234 /* 2235 * void *restarter_timeouts_event_thread(void *) 2236 * Responsible for monitoring the method timeouts. This thread must 2237 * be started before any methods are called. 2238 */ 2239 /*ARGSUSED*/ 2240 static void * 2241 restarter_timeouts_event_thread(void *unused) 2242 { 2243 /* 2244 * Timeouts are entered on a priority queue, which is processed by 2245 * this thread. As timeouts are specified in seconds, we'll do 2246 * the necessary processing every second, as long as the queue 2247 * is not empty. 2248 */ 2249 2250 /*CONSTCOND*/ 2251 while (1) { 2252 /* 2253 * As long as the timeout list isn't empty, process it 2254 * every second. 2255 */ 2256 if (timeout_now() == 0) { 2257 (void) sleep(1); 2258 continue; 2259 } 2260 2261 /* The list is empty, wait until we have more timeouts. 
		 */
		MUTEX_LOCK(&tu->tu_lock);

		while (tu->tu_wakeup == 0)
			(void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);

		tu->tu_wakeup = 0;
		MUTEX_UNLOCK(&tu->tu_lock);
	}

	return (NULL);
}

void
restarter_start()
{
	(void) startd_thread_create(restarter_timeouts_event_thread, NULL);
	(void) startd_thread_create(restarter_event_thread, NULL);
	(void) startd_thread_create(restarter_contracts_event_thread, NULL);
	(void) startd_thread_create(wait_thread, NULL);
}


void
restarter_init()
{
	restarter_instance_pool = startd_list_pool_create("restarter_instances",
	    sizeof (restarter_inst_t), offsetof(restarter_inst_t,
	    ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
	(void) memset(&instance_list, 0, sizeof (instance_list));

	(void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
	instance_list.ril_instance_list = startd_list_create(
	    restarter_instance_pool, &instance_list, UU_LIST_SORTED);

	restarter_queue_pool = startd_list_pool_create(
	    "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
	    offsetof(restarter_instance_qentry_t, riq_link), NULL,
	    UU_LIST_POOL_DEBUG);

	contract_list_pool = startd_list_pool_create(
	    "contract_list", sizeof (contract_entry_t),
	    offsetof(contract_entry_t, ce_link), NULL,
	    UU_LIST_POOL_DEBUG);
	contract_hash_init();

	log_framework(LOG_DEBUG, "Initialized restarter\n");
}