xref: /titanic_52/usr/src/cmd/svc/startd/restarter.c (revision 380789fc80376bd1573770361cb177a08c7e3524)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * restarter.c - service manipulation
30  *
31  * This component manages services whose restarter is svc.startd, the standard
32  * restarter.  It translates restarter protocol events from the graph engine
33  * into actions on processes, as a delegated restarter would do.
34  *
35  * The master restarter manages a number of always-running threads:
36  *   - restarter event thread: events from the graph engine
37  *   - timeout thread: thread to fire queued timeouts
38  *   - contract thread: thread to handle contract events
39  *   - wait thread: thread to handle wait-based services
40  *
41  * The other threads are created as-needed:
42  *   - per-instance method threads
43  *   - per-instance event processing threads
44  *
45  * The interaction of all threads must result in the following conditions
46  * being satisfied (on a per-instance basis):
47  *   - restarter events must be processed in order
48  *   - method execution must be serialized
49  *   - instance delete must be held until outstanding methods are complete
50  *   - contract events shouldn't be processed while a method is running
51  *   - timeouts should fire even when a method is running
52  *
53  * Service instances are represented by restarter_inst_t's and are kept in the
54  * instance_list list.
55  *
56  * Service States
57  *   The current state of a service instance is kept in
58  *   restarter_inst_t->ri_i.i_state.  If transition to a new state could take
59  *   some time, then before we effect the transition we set
60  *   restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
61  *   rotate i_next_state to i_state and set i_next_state to
62  *   RESTARTER_STATE_NONE.  So usually i_next_state is _NONE when ri_lock is not
63  *   held.  The exception is when we launch methods, which are done with
64  *   a separate thread.  To keep any other threads from grabbing ri_lock before
65  *   method_thread() does, we set ri_method_thread to the thread id of the
66  *   method thread, and when it is nonzero any thread with a different thread id
67  *   waits on ri_method_cv.
68  *
69  * Method execution is serialized by blocking on ri_method_cv in
70  * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread.  This
71  * also prevents the instance structure from being deleted until all
72  * outstanding operations such as method_thread() have finished.
73  *
74  * Lock ordering:
75  *
76  * dgraph_lock [can be held when taking:]
77  *   utmpx_lock
78  *   dictionary->dict_lock
79  *   st->st_load_lock
80  *   wait_info_lock
81  *   ru->restarter_update_lock
82  *     restarter_queue->rpeq_lock
83  *   instance_list.ril_lock
84  *     inst->ri_lock
85  *   st->st_configd_live_lock
86  *
87  * instance_list.ril_lock
88  *   graph_queue->gpeq_lock
89  *   gu->gu_lock
90  *   st->st_configd_live_lock
91  *   dictionary->dict_lock
92  *   inst->ri_lock
93  *     graph_queue->gpeq_lock
94  *     gu->gu_lock
95  *     tu->tu_lock
96  *     tq->tq_lock
97  *     inst->ri_queue_lock
98  *       wait_info_lock
99  *       bp->cb_lock
100  *     utmpx_lock
101  *
102  * single_user_thread_lock
103  *   wait_info_lock
104  *   utmpx_lock
105  *
106  * gu_freeze_lock
107  *
108  * logbuf_mutex nests inside pretty much everything.
109  */
110 
111 #include <sys/contract/process.h>
112 #include <sys/ctfs.h>
113 #include <sys/stat.h>
114 #include <sys/time.h>
115 #include <sys/types.h>
116 #include <sys/uio.h>
117 #include <sys/wait.h>
118 #include <assert.h>
119 #include <errno.h>
120 #include <fcntl.h>
121 #include <libcontract.h>
122 #include <libcontract_priv.h>
123 #include <libintl.h>
124 #include <librestart.h>
125 #include <librestart_priv.h>
126 #include <libuutil.h>
127 #include <limits.h>
128 #include <poll.h>
129 #include <port.h>
130 #include <pthread.h>
131 #include <stdarg.h>
132 #include <stdio.h>
133 #include <strings.h>
134 #include <unistd.h>
135 
136 #include "startd.h"
137 #include "protocol.h"
138 
/* List pool used to initialize the ri_link node of each restarter_inst_t. */
139 static uu_list_pool_t *restarter_instance_pool;
/*
 * The set of instances managed by this restarter: ril_instance_list holds
 * the restarter_inst_t's and ril_lock protects lookups, inserts and removals.
 */
140 static restarter_instance_list_t instance_list;
141 
/* List pool from which each instance's ri_queue list is created. */
142 static uu_list_pool_t *restarter_queue_pool;
143 
144 /*ARGSUSED*/
145 static int
146 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
147     void *private)
148 {
149 	int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
150 	int rc_id = *(int *)rc_arg;
151 
152 	if (lc_id > rc_id)
153 		return (1);
154 	if (lc_id < rc_id)
155 		return (-1);
156 	return (0);
157 }
158 
159 static restarter_inst_t *
160 inst_lookup_by_name(const char *name)
161 {
162 	int id;
163 
164 	id = dict_lookup_byname(name);
165 	if (id == -1)
166 		return (NULL);
167 
168 	return (inst_lookup_by_id(id));
169 }
170 
171 restarter_inst_t *
172 inst_lookup_by_id(int id)
173 {
174 	restarter_inst_t *inst;
175 
176 	MUTEX_LOCK(&instance_list.ril_lock);
177 	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
178 	if (inst != NULL)
179 		MUTEX_LOCK(&inst->ri_lock);
180 	MUTEX_UNLOCK(&instance_list.ril_lock);
181 
182 	if (inst != NULL) {
183 		while (inst->ri_method_thread != 0 &&
184 		    !pthread_equal(inst->ri_method_thread, pthread_self())) {
185 			++inst->ri_method_waiters;
186 			(void) pthread_cond_wait(&inst->ri_method_cv,
187 			    &inst->ri_lock);
188 			assert(inst->ri_method_waiters > 0);
189 			--inst->ri_method_waiters;
190 		}
191 	}
192 
193 	return (inst);
194 }
195 
196 static restarter_inst_t *
197 inst_lookup_queue(const char *name)
198 {
199 	int id;
200 	restarter_inst_t *inst;
201 
202 	id = dict_lookup_byname(name);
203 	if (id == -1)
204 		return (NULL);
205 
206 	MUTEX_LOCK(&instance_list.ril_lock);
207 	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
208 	if (inst != NULL)
209 		MUTEX_LOCK(&inst->ri_queue_lock);
210 	MUTEX_UNLOCK(&instance_list.ril_lock);
211 
212 	return (inst);
213 }
214 
215 const char *
216 service_style(int flags)
217 {
218 	switch (flags & RINST_STYLE_MASK) {
219 	case RINST_CONTRACT:	return ("contract");
220 	case RINST_TRANSIENT:	return ("transient");
221 	case RINST_WAIT:	return ("wait");
222 
223 	default:
224 #ifndef NDEBUG
225 		uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
226 #endif
227 		abort();
228 		/* NOTREACHED */
229 	}
230 }
231 
232 /*
233  * Fails with ECONNABORTED or ECANCELED.
234  */
235 static int
236 check_contract(restarter_inst_t *inst, boolean_t primary,
237     scf_instance_t *scf_inst)
238 {
239 	ctid_t *ctidp;
240 	int fd, r;
241 
242 	ctidp = primary ? &inst->ri_i.i_primary_ctid :
243 	    &inst->ri_i.i_transient_ctid;
244 
245 	assert(*ctidp >= 1);
246 
247 	fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
248 	if (fd >= 0) {
249 		r = close(fd);
250 		assert(r == 0);
251 		return (0);
252 	}
253 
254 	r = restarter_remove_contract(scf_inst, *ctidp, primary ?
255 	    RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
256 	switch (r) {
257 	case 0:
258 	case ECONNABORTED:
259 	case ECANCELED:
260 		*ctidp = 0;
261 		return (r);
262 
263 	case ENOMEM:
264 		uu_die("Out of memory\n");
265 		/* NOTREACHED */
266 
267 	case EPERM:
268 		uu_die("Insufficient privilege.\n");
269 		/* NOTREACHED */
270 
271 	case EACCES:
272 		uu_die("Repository backend access denied.\n");
273 		/* NOTREACHED */
274 
275 	case EROFS:
276 		log_error(LOG_INFO, "Could not remove unusable contract id %ld "
277 		    "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
278 		return (0);
279 
280 	case EINVAL:
281 	case EBADF:
282 	default:
283 		assert(0);
284 		abort();
285 		/* NOTREACHED */
286 	}
287 }
288 
289 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);
290 
291 /*
292  * int restarter_insert_inst(scf_handle_t *, char *)
293  *   If the inst is already in the restarter list, return its id.  If the inst
294  *   is not in the restarter list, initialize a restarter_inst_t, initialize its
295  *   states, insert it into the list, and return 0.
296  *
297  *   Fails with
298  *     ENOENT - name is not in the repository
299  */
300 static int
301 restarter_insert_inst(scf_handle_t *h, const char *name)
302 {
303 	int id, r;
304 	restarter_inst_t *inst;
305 	uu_list_index_t idx;
306 	scf_service_t *scf_svc;
307 	scf_instance_t *scf_inst;
308 	scf_snapshot_t *snap = NULL;
309 	scf_propertygroup_t *pg;
310 	char *svc_name, *inst_name;
311 	char logfilebuf[PATH_MAX];
312 	char *c;
313 	boolean_t do_commit_states;
314 	restarter_instance_state_t state, next_state;
315 	protocol_states_t *ps;
316 	pid_t start_pid;
317 
318 	MUTEX_LOCK(&instance_list.ril_lock);
319 
320 	/*
321 	 * We don't use inst_lookup_by_name() here because we want the lookup
322 	 * & insert to be atomic.
323 	 */
324 	id = dict_lookup_byname(name);
325 	if (id != -1) {
326 		inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
327 		    &idx);
328 		if (inst != NULL) {
329 			MUTEX_UNLOCK(&instance_list.ril_lock);
330 			return (0);
331 		}
332 	}
333 
334 	/* Allocate an instance */
335 	inst = startd_zalloc(sizeof (restarter_inst_t));
336 	inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
337 	inst->ri_utmpx_prefix[0] = '\0';
338 
339 	inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
340 	(void) strcpy((char *)inst->ri_i.i_fmri, name);
341 
342 	inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);
343 
344 	/*
345 	 * id shouldn't be -1 since we use the same dictionary as graph.c, but
346 	 * just in case.
347 	 */
348 	inst->ri_id = (id != -1 ? id : dict_insert(name));
349 
350 	special_online_hooks_get(name, &inst->ri_pre_online_hook,
351 	    &inst->ri_post_online_hook, &inst->ri_post_offline_hook);
352 
353 	scf_svc = safe_scf_service_create(h);
354 	scf_inst = safe_scf_instance_create(h);
355 	pg = safe_scf_pg_create(h);
356 	svc_name = startd_alloc(max_scf_name_size);
357 	inst_name = startd_alloc(max_scf_name_size);
358 
359 rep_retry:
360 	if (snap != NULL)
361 		scf_snapshot_destroy(snap);
362 	if (inst->ri_logstem != NULL)
363 		startd_free(inst->ri_logstem, PATH_MAX);
364 	if (inst->ri_common_name != NULL)
365 		startd_free(inst->ri_common_name, max_scf_value_size);
366 	if (inst->ri_C_common_name != NULL)
367 		startd_free(inst->ri_C_common_name, max_scf_value_size);
368 	snap = NULL;
369 	inst->ri_logstem = NULL;
370 	inst->ri_common_name = NULL;
371 	inst->ri_C_common_name = NULL;
372 
373 	if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
374 	    NULL, SCF_DECODE_FMRI_EXACT) != 0) {
375 		switch (scf_error()) {
376 		case SCF_ERROR_CONNECTION_BROKEN:
377 			libscf_handle_rebind(h);
378 			goto rep_retry;
379 
380 		case SCF_ERROR_NOT_FOUND:
381 			goto deleted;
382 		}
383 
384 		uu_die("Can't decode FMRI %s: %s\n", name,
385 		    scf_strerror(scf_error()));
386 	}
387 
388 	/*
389 	 * If there's no running snapshot, then we execute using the editing
390 	 * snapshot.  Pending snapshots will be taken later.
391 	 */
392 	snap = libscf_get_running_snapshot(scf_inst);
393 
394 	if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
395 	    (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
396 	    0)) {
397 		switch (scf_error()) {
398 		case SCF_ERROR_NOT_SET:
399 			break;
400 
401 		case SCF_ERROR_CONNECTION_BROKEN:
402 			libscf_handle_rebind(h);
403 			goto rep_retry;
404 
405 		default:
406 			assert(0);
407 			abort();
408 		}
409 
410 		goto deleted;
411 	}
412 
413 	(void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
414 	for (c = logfilebuf; *c != '\0'; c++)
415 		if (*c == '/')
416 			*c = '-';
417 
418 	inst->ri_logstem = startd_alloc(PATH_MAX);
419 	(void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
420 	    LOG_SUFFIX);
421 
422 	/*
423 	 * If the restarter group is missing, use uninit/none.  Otherwise,
424 	 * we're probably being restarted & don't want to mess up the states
425 	 * that are there.
426 	 */
427 	state = RESTARTER_STATE_UNINIT;
428 	next_state = RESTARTER_STATE_NONE;
429 
430 	r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
431 	if (r != 0) {
432 		switch (scf_error()) {
433 		case SCF_ERROR_CONNECTION_BROKEN:
434 			libscf_handle_rebind(h);
435 			goto rep_retry;
436 
437 		case SCF_ERROR_NOT_SET:
438 			goto deleted;
439 
440 		case SCF_ERROR_NOT_FOUND:
441 			/*
442 			 * This shouldn't happen since the graph engine should
443 			 * have initialized the state to uninitialized/none if
444 			 * there was no restarter pg.  In case somebody
445 			 * deleted it, though....
446 			 */
447 			do_commit_states = B_TRUE;
448 			break;
449 
450 		default:
451 			assert(0);
452 			abort();
453 		}
454 	} else {
455 		r = libscf_read_states(pg, &state, &next_state);
456 		if (r != 0) {
457 			do_commit_states = B_TRUE;
458 		} else {
459 			if (next_state != RESTARTER_STATE_NONE) {
460 				/*
461 				 * Force next_state to _NONE since we
462 				 * don't look for method processes.
463 				 */
464 				next_state = RESTARTER_STATE_NONE;
465 				do_commit_states = B_TRUE;
466 			} else {
467 				/*
468 				 * Inform the restarter of our state without
469 				 * changing the STIME in the repository.
470 				 */
471 				ps = startd_alloc(sizeof (*ps));
472 				inst->ri_i.i_state = ps->ps_state = state;
473 				inst->ri_i.i_next_state = ps->ps_state_next =
474 				    next_state;
475 
476 				graph_protocol_send_event(inst->ri_i.i_fmri,
477 				    GRAPH_UPDATE_STATE_CHANGE, ps);
478 
479 				do_commit_states = B_FALSE;
480 			}
481 		}
482 	}
483 
484 	switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
485 	    &inst->ri_utmpx_prefix)) {
486 	case 0:
487 		break;
488 
489 	case ECONNABORTED:
490 		libscf_handle_rebind(h);
491 		goto rep_retry;
492 
493 	case ECANCELED:
494 		goto deleted;
495 
496 	case ENOENT:
497 		/*
498 		 * This is odd, because the graph engine should have required
499 		 * the general property group.  So we'll just use default
500 		 * flags in anticipation of the graph engine sending us
501 		 * REMOVE_INSTANCE when it finds out that the general property
502 		 * group has been deleted.
503 		 */
504 		inst->ri_flags = RINST_CONTRACT;
505 		break;
506 
507 	default:
508 		assert(0);
509 		abort();
510 	}
511 
512 	switch (libscf_get_template_values(scf_inst, snap,
513 	    &inst->ri_common_name, &inst->ri_C_common_name)) {
514 	case 0:
515 		break;
516 
517 	case ECONNABORTED:
518 		libscf_handle_rebind(h);
519 		goto rep_retry;
520 
521 	case ECANCELED:
522 		goto deleted;
523 
524 	case ECHILD:
525 	case ENOENT:
526 		break;
527 
528 	default:
529 		assert(0);
530 		abort();
531 	}
532 
533 	switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
534 	    &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
535 	    &start_pid)) {
536 	case 0:
537 		break;
538 
539 	case ECONNABORTED:
540 		libscf_handle_rebind(h);
541 		goto rep_retry;
542 
543 	case ECANCELED:
544 		goto deleted;
545 
546 	default:
547 		assert(0);
548 		abort();
549 	}
550 
551 	if (inst->ri_i.i_primary_ctid >= 1) {
552 		contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);
553 
554 		switch (check_contract(inst, B_TRUE, scf_inst)) {
555 		case 0:
556 			break;
557 
558 		case ECONNABORTED:
559 			libscf_handle_rebind(h);
560 			goto rep_retry;
561 
562 		case ECANCELED:
563 			goto deleted;
564 
565 		default:
566 			assert(0);
567 			abort();
568 		}
569 	}
570 
571 	if (inst->ri_i.i_transient_ctid >= 1) {
572 		switch (check_contract(inst, B_FALSE, scf_inst)) {
573 		case 0:
574 			break;
575 
576 		case ECONNABORTED:
577 			libscf_handle_rebind(h);
578 			goto rep_retry;
579 
580 		case ECANCELED:
581 			goto deleted;
582 
583 		default:
584 			assert(0);
585 			abort();
586 		}
587 	}
588 
589 	/* No more failures we live through, so add it to the list. */
590 	(void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
591 	(void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
592 	MUTEX_LOCK(&inst->ri_lock);
593 	MUTEX_LOCK(&inst->ri_queue_lock);
594 
595 	(void) pthread_cond_init(&inst->ri_method_cv, NULL);
596 
597 	uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
598 	uu_list_insert(instance_list.ril_instance_list, inst, idx);
599 	MUTEX_UNLOCK(&instance_list.ril_lock);
600 
601 	if (start_pid != -1 &&
602 	    (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
603 		int ret;
604 		ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
605 		if (ret == -1) {
606 			/*
607 			 * Implication:  if we can't reregister the
608 			 * instance, we will start another one.  Two
609 			 * instances may or may not result in a resource
610 			 * conflict.
611 			 */
612 			log_error(LOG_WARNING,
613 			    "%s: couldn't reregister %ld for wait\n",
614 			    inst->ri_i.i_fmri, start_pid);
615 		} else if (ret == 1) {
616 			/*
617 			 * Leading PID has exited.
618 			 */
619 			(void) stop_instance(h, inst, RSTOP_EXIT);
620 		}
621 	}
622 
623 
624 	scf_pg_destroy(pg);
625 
626 	if (do_commit_states)
627 		(void) restarter_instance_update_states(h, inst, state,
628 		    next_state, RERR_NONE, NULL);
629 
630 	log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
631 	    service_style(inst->ri_flags));
632 
633 	MUTEX_UNLOCK(&inst->ri_queue_lock);
634 	MUTEX_UNLOCK(&inst->ri_lock);
635 
636 	startd_free(svc_name, max_scf_name_size);
637 	startd_free(inst_name, max_scf_name_size);
638 	scf_snapshot_destroy(snap);
639 	scf_instance_destroy(scf_inst);
640 	scf_service_destroy(scf_svc);
641 
642 	log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
643 	    name);
644 
645 	return (0);
646 
647 deleted:
648 	MUTEX_UNLOCK(&instance_list.ril_lock);
649 	startd_free(inst_name, max_scf_name_size);
650 	startd_free(svc_name, max_scf_name_size);
651 	if (snap != NULL)
652 		scf_snapshot_destroy(snap);
653 	scf_pg_destroy(pg);
654 	scf_instance_destroy(scf_inst);
655 	scf_service_destroy(scf_svc);
656 	startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
657 	uu_list_destroy(inst->ri_queue);
658 	if (inst->ri_logstem != NULL)
659 		startd_free(inst->ri_logstem, PATH_MAX);
660 	if (inst->ri_common_name != NULL)
661 		startd_free(inst->ri_common_name, max_scf_value_size);
662 	if (inst->ri_C_common_name != NULL)
663 		startd_free(inst->ri_C_common_name, max_scf_value_size);
664 	startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
665 	startd_free(inst, sizeof (restarter_inst_t));
666 	return (ENOENT);
667 }
668 
669 static void
670 restarter_delete_inst(restarter_inst_t *ri)
671 {
672 	int id;
673 	restarter_inst_t *rip;
674 	void *cookie = NULL;
675 	restarter_instance_qentry_t *e;
676 
677 	assert(PTHREAD_MUTEX_HELD(&ri->ri_lock));
678 
679 	/*
680 	 * Must drop the instance lock so we can pick up the instance_list
681 	 * lock & remove the instance.
682 	 */
683 	id = ri->ri_id;
684 	MUTEX_UNLOCK(&ri->ri_lock);
685 
686 	MUTEX_LOCK(&instance_list.ril_lock);
687 
688 	rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
689 	if (rip == NULL) {
690 		MUTEX_UNLOCK(&instance_list.ril_lock);
691 		return;
692 	}
693 
694 	assert(ri == rip);
695 
696 	uu_list_remove(instance_list.ril_instance_list, ri);
697 
698 	log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
699 	    ri->ri_i.i_fmri);
700 
701 	MUTEX_UNLOCK(&instance_list.ril_lock);
702 
703 	/*
704 	 * We can lock the instance without holding the instance_list lock
705 	 * since we removed the instance from the list.
706 	 */
707 	MUTEX_LOCK(&ri->ri_lock);
708 	MUTEX_LOCK(&ri->ri_queue_lock);
709 
710 	if (ri->ri_i.i_primary_ctid >= 1)
711 		contract_hash_remove(ri->ri_i.i_primary_ctid);
712 
713 	while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
714 		(void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);
715 
716 	while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
717 		startd_free(e, sizeof (*e));
718 	uu_list_destroy(ri->ri_queue);
719 
720 	startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
721 	startd_free(ri->ri_logstem, PATH_MAX);
722 	if (ri->ri_common_name != NULL)
723 		startd_free(ri->ri_common_name, max_scf_value_size);
724 	if (ri->ri_C_common_name != NULL)
725 		startd_free(ri->ri_C_common_name, max_scf_value_size);
726 	startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
727 	(void) pthread_mutex_destroy(&ri->ri_lock);
728 	(void) pthread_mutex_destroy(&ri->ri_queue_lock);
729 	startd_free(ri, sizeof (restarter_inst_t));
730 }
731 
732 /*
733  * instance_is_wait_style()
734  *
735  *   Returns 1 if the given instance is a "wait-style" service instance.
736  */
737 int
738 instance_is_wait_style(restarter_inst_t *inst)
739 {
740 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
741 	return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
742 }
743 
744 /*
745  * instance_is_transient_style()
746  *
747  *   Returns 1 if the given instance is a transient service instance.
748  */
749 int
750 instance_is_transient_style(restarter_inst_t *inst)
751 {
752 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
753 	return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
754 }
755 
756 /*
757  * instance_in_transition()
758  * Returns 1 if instance is in transition, 0 if not
759  */
760 int
761 instance_in_transition(restarter_inst_t *inst)
762 {
763 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
764 	if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
765 		return (0);
766 	return (1);
767 }
768 
769 /*
770  * returns 1 if instance is already started, 0 if not
771  */
772 static int
773 instance_started(restarter_inst_t *inst)
774 {
775 	int ret;
776 
777 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
778 
779 	if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
780 	    inst->ri_i.i_state == RESTARTER_STATE_DEGRADED)
781 		ret = 1;
782 	else
783 		ret = 0;
784 
785 	return (ret);
786 }
787 
788 /*
789  * Returns
790  *   0 - success
791  *   ECONNRESET - success, but h was rebound
792  */
793 int
794 restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
795     restarter_instance_state_t new_state,
796     restarter_instance_state_t new_state_next, restarter_error_t err, char *aux)
797 {
798 	protocol_states_t *states;
799 	int e;
800 	uint_t retry_count = 0, msecs = ALLOC_DELAY;
801 	boolean_t rebound = B_FALSE;
802 	int prev_state_online;
803 	int state_online;
804 
805 	assert(PTHREAD_MUTEX_HELD(&ri->ri_lock));
806 
807 	prev_state_online = instance_started(ri);
808 
809 retry:
810 	e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
811 	    aux);
812 	switch (e) {
813 	case 0:
814 		break;
815 
816 	case ENOMEM:
817 		++retry_count;
818 		if (retry_count < ALLOC_RETRY) {
819 			(void) poll(NULL, 0, msecs);
820 			msecs *= ALLOC_DELAY_MULT;
821 			goto retry;
822 		}
823 
824 		/* Like startd_alloc(). */
825 		uu_die("Insufficient memory.\n");
826 		/* NOTREACHED */
827 
828 	case ECONNABORTED:
829 		libscf_handle_rebind(h);
830 		rebound = B_TRUE;
831 		goto retry;
832 
833 	case EPERM:
834 	case EACCES:
835 	case EROFS:
836 		log_error(LOG_NOTICE, "Could not commit state change for %s "
837 		    "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
838 		/* FALLTHROUGH */
839 
840 	case ENOENT:
841 		ri->ri_i.i_state = new_state;
842 		ri->ri_i.i_next_state = new_state_next;
843 		break;
844 
845 	case EINVAL:
846 	default:
847 		bad_error("_restarter_commit_states", e);
848 	}
849 
850 	states = startd_alloc(sizeof (protocol_states_t));
851 	states->ps_state = new_state;
852 	states->ps_state_next = new_state_next;
853 	states->ps_err = err;
854 	graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
855 	    (void *)states);
856 
857 	state_online = instance_started(ri);
858 
859 	if (prev_state_online && !state_online)
860 		ri->ri_post_offline_hook();
861 	else if (!prev_state_online && state_online)
862 		ri->ri_post_online_hook();
863 
864 	return (rebound ? ECONNRESET : 0);
865 }
866 
867 void
868 restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
869 {
870 	restarter_inst_t *inst;
871 
872 	assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);
873 
874 	inst = inst_lookup_by_name(fmri);
875 	if (inst == NULL)
876 		return;
877 
878 	inst->ri_flags |= flag;
879 
880 	MUTEX_UNLOCK(&inst->ri_lock);
881 }
882 
883 static void
884 restarter_take_pending_snapshots(scf_handle_t *h)
885 {
886 	restarter_inst_t *inst;
887 	int r;
888 
889 	MUTEX_LOCK(&instance_list.ril_lock);
890 
891 	for (inst = uu_list_first(instance_list.ril_instance_list);
892 	    inst != NULL;
893 	    inst = uu_list_next(instance_list.ril_instance_list, inst)) {
894 		const char *fmri;
895 		scf_instance_t *sinst = NULL;
896 
897 		MUTEX_LOCK(&inst->ri_lock);
898 
899 		/*
900 		 * This is where we'd check inst->ri_method_thread and if it
901 		 * were nonzero we'd wait in anticipation of another thread
902 		 * executing a method for inst.  Doing so with the instance_list
903 		 * locked, though, leads to deadlock.  Since taking a snapshot
904 		 * during that window won't hurt anything, we'll just continue.
905 		 */
906 
907 		fmri = inst->ri_i.i_fmri;
908 
909 		if (inst->ri_flags & RINST_RETAKE_RUNNING) {
910 			scf_snapshot_t *rsnap;
911 
912 			(void) libscf_fmri_get_instance(h, fmri, &sinst);
913 
914 			rsnap = libscf_get_or_make_running_snapshot(sinst,
915 			    fmri, B_FALSE);
916 
917 			scf_instance_destroy(sinst);
918 
919 			if (rsnap != NULL)
920 				inst->ri_flags &= ~RINST_RETAKE_RUNNING;
921 
922 			scf_snapshot_destroy(rsnap);
923 		}
924 
925 		if (inst->ri_flags & RINST_RETAKE_START) {
926 			switch (r = libscf_snapshots_poststart(h, fmri,
927 			    B_FALSE)) {
928 			case 0:
929 			case ENOENT:
930 				inst->ri_flags &= ~RINST_RETAKE_START;
931 				break;
932 
933 			case ECONNABORTED:
934 				break;
935 
936 			case EACCES:
937 			default:
938 				bad_error("libscf_snapshots_poststart", r);
939 			}
940 		}
941 
942 		MUTEX_UNLOCK(&inst->ri_lock);
943 	}
944 
945 	MUTEX_UNLOCK(&instance_list.ril_lock);
946 }
947 
948 /* ARGSUSED */
949 void *
950 restarter_post_fsminimal_thread(void *unused)
951 {
952 	scf_handle_t *h;
953 	int r;
954 
955 	h = libscf_handle_create_bound_loop();
956 
957 	for (;;) {
958 		r = libscf_create_self(h);
959 		if (r == 0)
960 			break;
961 
962 		assert(r == ECONNABORTED);
963 		libscf_handle_rebind(h);
964 	}
965 
966 	restarter_take_pending_snapshots(h);
967 
968 	(void) scf_handle_unbind(h);
969 	scf_handle_destroy(h);
970 
971 	return (NULL);
972 }
973 
974 /*
975  * int stop_instance()
976  *
977  *   Stop the instance identified by the instance given as the second argument,
978  *   for the cause stated.
979  *
980  *   Returns
981  *     0 - success
982  *     -1 - inst is in transition
983  */
984 static int
985 stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
986     stop_cause_t cause)
987 {
988 	fork_info_t *info;
989 	const char *cp;
990 	int err;
991 	restarter_error_t re;
992 
993 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
994 	assert(inst->ri_method_thread == 0);
995 
996 	switch (cause) {
997 	case RSTOP_EXIT:
998 		re = RERR_RESTART;
999 		cp = "all processes in service exited";
1000 		break;
1001 	case RSTOP_CORE:
1002 		re = RERR_FAULT;
1003 		cp = "process dumped core";
1004 		break;
1005 	case RSTOP_SIGNAL:
1006 		re = RERR_FAULT;
1007 		cp = "process received fatal signal from outside the service";
1008 		break;
1009 	case RSTOP_HWERR:
1010 		re = RERR_FAULT;
1011 		cp = "process killed due to uncorrectable hardware error";
1012 		break;
1013 	case RSTOP_DEPENDENCY:
1014 		re = RERR_RESTART;
1015 		cp = "dependency activity requires stop";
1016 		break;
1017 	case RSTOP_DISABLE:
1018 		re = RERR_RESTART;
1019 		cp = "service disabled";
1020 		break;
1021 	case RSTOP_RESTART:
1022 		re = RERR_RESTART;
1023 		cp = "service restarting";
1024 		break;
1025 	default:
1026 #ifndef NDEBUG
1027 		(void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
1028 		    cause, __FILE__, __LINE__);
1029 #endif
1030 		abort();
1031 	}
1032 
1033 	/* Services in the disabled and maintenance state are ignored */
1034 	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1035 	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
1036 		log_framework(LOG_DEBUG,
1037 		    "%s: stop_instance -> is maint/disabled\n",
1038 		    inst->ri_i.i_fmri);
1039 		return (0);
1040 	}
1041 
1042 	/* Already stopped instances are left alone */
1043 	if (instance_started(inst) == 0) {
1044 		log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
1045 		    inst->ri_i.i_fmri);
1046 		return (0);
1047 	}
1048 
1049 	if (instance_in_transition(inst)) {
1050 		/* requeue event by returning -1 */
1051 		log_framework(LOG_DEBUG,
1052 		    "Restarter: Not stopping %s, in transition.\n",
1053 		    inst->ri_i.i_fmri);
1054 		return (-1);
1055 	}
1056 
1057 	log_instance(inst, B_TRUE, "Stopping because %s.", cp);
1058 
1059 	log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
1060 	    "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);
1061 
1062 	if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) {
1063 		/*
1064 		 * No need to stop instance, as child has exited; remove
1065 		 * contract and move the instance to the offline state.
1066 		 */
1067 		switch (err = restarter_instance_update_states(local_handle,
1068 		    inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
1069 		    NULL)) {
1070 		case 0:
1071 		case ECONNRESET:
1072 			break;
1073 
1074 		default:
1075 			bad_error("restarter_instance_update_states", err);
1076 		}
1077 
1078 		(void) update_fault_count(inst, FAULT_COUNT_RESET);
1079 
1080 		if (inst->ri_i.i_primary_ctid != 0) {
1081 			inst->ri_m_inst =
1082 			    safe_scf_instance_create(local_handle);
1083 			inst->ri_mi_deleted = B_FALSE;
1084 
1085 			libscf_reget_instance(inst);
1086 			method_remove_contract(inst, B_TRUE, B_TRUE);
1087 
1088 			scf_instance_destroy(inst->ri_m_inst);
1089 			inst->ri_m_inst = NULL;
1090 		}
1091 
1092 		switch (err = restarter_instance_update_states(local_handle,
1093 		    inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
1094 		    NULL)) {
1095 		case 0:
1096 		case ECONNRESET:
1097 			break;
1098 
1099 		default:
1100 			bad_error("restarter_instance_update_states", err);
1101 		}
1102 
1103 		return (0);
1104 	}
1105 
1106 	switch (err = restarter_instance_update_states(local_handle, inst,
1107 	    inst->ri_i.i_state, inst->ri_i.i_enabled ? RESTARTER_STATE_OFFLINE :
1108 	    RESTARTER_STATE_DISABLED, RERR_NONE, NULL)) {
1109 	case 0:
1110 	case ECONNRESET:
1111 		break;
1112 
1113 	default:
1114 		bad_error("restarter_instance_update_states", err);
1115 	}
1116 
1117 	info = startd_zalloc(sizeof (fork_info_t));
1118 
1119 	info->sf_id = inst->ri_id;
1120 	info->sf_method_type = METHOD_STOP;
1121 	info->sf_event_type = re;
1122 	inst->ri_method_thread = startd_thread_create(method_thread, info);
1123 
1124 	return (0);
1125 }
1126 
1127 /*
1128  * Returns
1129  *   ENOENT - fmri is not in instance_list
1130  *   0 - success
1131  *   ECONNRESET - success, though handle was rebound
1132  *   -1 - instance is in transition
1133  */
1134 int
1135 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
1136 {
1137 	restarter_inst_t *rip;
1138 	int r;
1139 
1140 	rip = inst_lookup_by_name(fmri);
1141 	if (rip == NULL)
1142 		return (ENOENT);
1143 
1144 	r = stop_instance(h, rip, flags);
1145 
1146 	MUTEX_UNLOCK(&rip->ri_lock);
1147 
1148 	return (r);
1149 }
1150 
1151 static void
1152 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
1153     unmaint_cause_t cause)
1154 {
1155 	ctid_t ctid;
1156 	scf_instance_t *inst;
1157 	int r;
1158 	uint_t tries = 0, msecs = ALLOC_DELAY;
1159 	const char *cp;
1160 
1161 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1162 
1163 	if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
1164 		log_error(LOG_DEBUG, "Restarter: "
1165 		    "Ignoring maintenance off command because %s is not in the "
1166 		    "maintenance state.\n", rip->ri_i.i_fmri);
1167 		return;
1168 	}
1169 
1170 	switch (cause) {
1171 	case RUNMAINT_CLEAR:
1172 		cp = "clear requested";
1173 		break;
1174 	case RUNMAINT_DISABLE:
1175 		cp = "disable requested";
1176 		break;
1177 	default:
1178 #ifndef NDEBUG
1179 		(void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
1180 		    cause, __FILE__, __LINE__);
1181 #endif
1182 		abort();
1183 	}
1184 
1185 	log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
1186 	    cp);
1187 	log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
1188 	    "%s.\n", rip->ri_i.i_fmri, cp);
1189 
1190 	(void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
1191 	    RESTARTER_STATE_NONE, RERR_RESTART, NULL);
1192 
1193 	/*
1194 	 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
1195 	 * a primary contract.
1196 	 */
1197 	if (rip->ri_i.i_primary_ctid == 0)
1198 		return;
1199 
1200 	ctid = rip->ri_i.i_primary_ctid;
1201 	contract_abandon(ctid);
1202 	rip->ri_i.i_primary_ctid = 0;
1203 
1204 rep_retry:
1205 	switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
1206 	case 0:
1207 		break;
1208 
1209 	case ECONNABORTED:
1210 		libscf_handle_rebind(h);
1211 		goto rep_retry;
1212 
1213 	case ENOENT:
1214 		/* Must have been deleted. */
1215 		return;
1216 
1217 	case EINVAL:
1218 	case ENOTSUP:
1219 	default:
1220 		bad_error("libscf_handle_rebind", r);
1221 	}
1222 
1223 again:
1224 	r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
1225 	switch (r) {
1226 	case 0:
1227 		break;
1228 
1229 	case ENOMEM:
1230 		++tries;
1231 		if (tries < ALLOC_RETRY) {
1232 			(void) poll(NULL, 0, msecs);
1233 			msecs *= ALLOC_DELAY_MULT;
1234 			goto again;
1235 		}
1236 
1237 		uu_die("Insufficient memory.\n");
1238 		/* NOTREACHED */
1239 
1240 	case ECONNABORTED:
1241 		scf_instance_destroy(inst);
1242 		libscf_handle_rebind(h);
1243 		goto rep_retry;
1244 
1245 	case ECANCELED:
1246 		break;
1247 
1248 	case EPERM:
1249 	case EACCES:
1250 	case EROFS:
1251 		log_error(LOG_INFO,
1252 		    "Could not remove contract id %lu for %s (%s).\n", ctid,
1253 		    rip->ri_i.i_fmri, strerror(r));
1254 		break;
1255 
1256 	case EINVAL:
1257 	case EBADF:
1258 	default:
1259 		bad_error("restarter_remove_contract", r);
1260 	}
1261 
1262 	scf_instance_destroy(inst);
1263 }
1264 
1265 /*
1266  * enable_inst()
1267  *   Set inst->ri_i.i_enabled.  Expects 'e' to be _ENABLE, _DISABLE, or
1268  *   _ADMIN_DISABLE.  If the event is _ENABLE and inst is uninitialized or
1269  *   disabled, move it to offline.  If the event is _DISABLE or
1270  *   _ADMIN_DISABLE, make sure inst will move to disabled.
1271  *
1272  *   Returns
1273  *     0 - success
1274  *     ECONNRESET - h was rebound
1275  */
1276 static int
1277 enable_inst(scf_handle_t *h, restarter_inst_t *inst, restarter_event_type_t e)
1278 {
1279 	restarter_instance_state_t state;
1280 	int r;
1281 
1282 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1283 	assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
1284 	    e == RESTARTER_EVENT_TYPE_DISABLE ||
1285 	    e == RESTARTER_EVENT_TYPE_ENABLE);
1286 	assert(instance_in_transition(inst) == 0);
1287 
1288 	state = inst->ri_i.i_state;
1289 
1290 	if (e == RESTARTER_EVENT_TYPE_ENABLE) {
1291 		inst->ri_i.i_enabled = 1;
1292 
1293 		if (state == RESTARTER_STATE_UNINIT ||
1294 		    state == RESTARTER_STATE_DISABLED) {
1295 			/*
1296 			 * B_FALSE: Don't log an error if the log_instance()
1297 			 * fails because it will fail on the miniroot before
1298 			 * install-discovery runs.
1299 			 */
1300 			log_instance(inst, B_FALSE, "Enabled.");
1301 			log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
1302 			    inst->ri_i.i_fmri);
1303 			(void) restarter_instance_update_states(h, inst,
1304 			    RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
1305 			    RERR_NONE, NULL);
1306 		} else {
1307 			log_framework(LOG_DEBUG, "Restarter: "
1308 			    "Not changing state of %s for enable command.\n",
1309 			    inst->ri_i.i_fmri);
1310 		}
1311 	} else {
1312 		inst->ri_i.i_enabled = 0;
1313 
1314 		switch (state) {
1315 		case RESTARTER_STATE_ONLINE:
1316 		case RESTARTER_STATE_DEGRADED:
1317 			r = stop_instance(h, inst, RSTOP_DISABLE);
1318 			return (r == ECONNRESET ? 0 : r);
1319 
1320 		case RESTARTER_STATE_OFFLINE:
1321 		case RESTARTER_STATE_UNINIT:
1322 			if (inst->ri_i.i_primary_ctid != 0) {
1323 				inst->ri_m_inst = safe_scf_instance_create(h);
1324 				inst->ri_mi_deleted = B_FALSE;
1325 
1326 				libscf_reget_instance(inst);
1327 				method_remove_contract(inst, B_TRUE, B_TRUE);
1328 
1329 				scf_instance_destroy(inst->ri_m_inst);
1330 			}
1331 			/* B_FALSE: See log_instance(..., "Enabled."); above */
1332 			log_instance(inst, B_FALSE, "Disabled.");
1333 			log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
1334 			    inst->ri_i.i_fmri);
1335 			(void) restarter_instance_update_states(h, inst,
1336 			    RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
1337 			    RERR_RESTART, NULL);
1338 			return (0);
1339 
1340 		case RESTARTER_STATE_DISABLED:
1341 			break;
1342 
1343 		case RESTARTER_STATE_MAINT:
1344 			/*
1345 			 * We only want to pull the instance out of maintenance
1346 			 * if the disable is on adminstrative request.  The
1347 			 * graph engine sends _DISABLE events whenever a
1348 			 * service isn't in the disabled state, and we don't
1349 			 * want to pull the service out of maintenance if,
1350 			 * for example, it is there due to a dependency cycle.
1351 			 */
1352 			if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
1353 				unmaintain_instance(h, inst, RUNMAINT_DISABLE);
1354 			break;
1355 
1356 		default:
1357 #ifndef NDEBUG
1358 			(void) fprintf(stderr, "Restarter instance %s has "
1359 			    "unknown state %d.\n", inst->ri_i.i_fmri, state);
1360 #endif
1361 			abort();
1362 		}
1363 	}
1364 
1365 	return (0);
1366 }
1367 
1368 static void
1369 start_instance(scf_handle_t *local_handle, restarter_inst_t *inst)
1370 {
1371 	fork_info_t *info;
1372 
1373 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1374 	assert(instance_in_transition(inst) == 0);
1375 	assert(inst->ri_method_thread == 0);
1376 
1377 	log_framework(LOG_DEBUG, "%s: trying to start instance\n",
1378 	    inst->ri_i.i_fmri);
1379 
1380 	/* Services in the disabled and maintenance state are ignored */
1381 	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1382 	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
1383 	    inst->ri_i.i_enabled == 0) {
1384 		log_framework(LOG_DEBUG,
1385 		    "%s: start_instance -> is maint/disabled\n",
1386 		    inst->ri_i.i_fmri);
1387 		return;
1388 	}
1389 
1390 	/* Already started instances are left alone */
1391 	if (instance_started(inst) == 1) {
1392 		log_framework(LOG_DEBUG,
1393 		    "%s: start_instance -> is already started\n",
1394 		    inst->ri_i.i_fmri);
1395 		return;
1396 	}
1397 
1398 	log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);
1399 
1400 	(void) restarter_instance_update_states(local_handle, inst,
1401 	    inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, NULL);
1402 
1403 	info = startd_zalloc(sizeof (fork_info_t));
1404 
1405 	info->sf_id = inst->ri_id;
1406 	info->sf_method_type = METHOD_START;
1407 	info->sf_event_type = RERR_NONE;
1408 	inst->ri_method_thread = startd_thread_create(method_thread, info);
1409 }
1410 
1411 static void
1412 maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
1413     const char *aux)
1414 {
1415 	fork_info_t *info;
1416 
1417 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1418 	assert(aux != NULL);
1419 	assert(rip->ri_method_thread == 0);
1420 
1421 	log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.", aux);
1422 	log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
1423 	    rip->ri_i.i_fmri, aux);
1424 
1425 	/* Services in the maintenance state are ignored */
1426 	if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
1427 		log_framework(LOG_DEBUG,
1428 		    "%s: maintain_instance -> is already in maintenance\n",
1429 		    rip->ri_i.i_fmri);
1430 		return;
1431 	}
1432 
1433 	if (immediate || !instance_started(rip)) {
1434 		if (rip->ri_i.i_primary_ctid != 0) {
1435 			rip->ri_m_inst = safe_scf_instance_create(h);
1436 			rip->ri_mi_deleted = B_FALSE;
1437 
1438 			libscf_reget_instance(rip);
1439 			method_remove_contract(rip, B_TRUE, B_TRUE);
1440 
1441 			scf_instance_destroy(rip->ri_m_inst);
1442 		}
1443 
1444 		(void) restarter_instance_update_states(h, rip,
1445 		    RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
1446 		    (char *)aux);
1447 		return;
1448 	}
1449 
1450 	(void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
1451 	    RESTARTER_STATE_MAINT, RERR_NONE, (char *)aux);
1452 
1453 	log_transition(rip, MAINT_REQUESTED);
1454 
1455 	info = startd_zalloc(sizeof (*info));
1456 	info->sf_id = rip->ri_id;
1457 	info->sf_method_type = METHOD_STOP;
1458 	info->sf_event_type = RERR_RESTART;
1459 	rip->ri_method_thread = startd_thread_create(method_thread, info);
1460 }
1461 
1462 static void
1463 refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
1464 {
1465 	scf_instance_t *inst;
1466 	scf_snapshot_t *snap;
1467 	fork_info_t *info;
1468 	int r;
1469 
1470 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1471 
1472 	log_instance(rip, B_TRUE, "Rereading configuration.");
1473 	log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
1474 	    rip->ri_i.i_fmri);
1475 
1476 rep_retry:
1477 	r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
1478 	switch (r) {
1479 	case 0:
1480 		break;
1481 
1482 	case ECONNABORTED:
1483 		libscf_handle_rebind(h);
1484 		goto rep_retry;
1485 
1486 	case ENOENT:
1487 		/* Must have been deleted. */
1488 		return;
1489 
1490 	case EINVAL:
1491 	case ENOTSUP:
1492 	default:
1493 		bad_error("libscf_fmri_get_instance", r);
1494 	}
1495 
1496 	snap = libscf_get_running_snapshot(inst);
1497 
1498 	r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
1499 	    &rip->ri_utmpx_prefix);
1500 	switch (r) {
1501 	case 0:
1502 		log_framework(LOG_DEBUG, "%s is a %s-style service\n",
1503 		    rip->ri_i.i_fmri, service_style(rip->ri_flags));
1504 		break;
1505 
1506 	case ECONNABORTED:
1507 		scf_instance_destroy(inst);
1508 		scf_snapshot_destroy(snap);
1509 		libscf_handle_rebind(h);
1510 		goto rep_retry;
1511 
1512 	case ECANCELED:
1513 	case ENOENT:
1514 		/* Succeed in anticipation of REMOVE_INSTANCE. */
1515 		break;
1516 
1517 	default:
1518 		bad_error("libscf_get_startd_properties", r);
1519 	}
1520 
1521 	if (instance_started(rip)) {
1522 		/* Refresh does not change the state. */
1523 		(void) restarter_instance_update_states(h, rip,
1524 		    rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE, NULL);
1525 
1526 		info = startd_zalloc(sizeof (*info));
1527 		info->sf_id = rip->ri_id;
1528 		info->sf_method_type = METHOD_REFRESH;
1529 		info->sf_event_type = RERR_REFRESH;
1530 
1531 		assert(rip->ri_method_thread == 0);
1532 		rip->ri_method_thread =
1533 		    startd_thread_create(method_thread, info);
1534 	}
1535 
1536 	scf_snapshot_destroy(snap);
1537 	scf_instance_destroy(inst);
1538 }
1539 
/*
 * Printable names for restarter events.  The array is indexed directly by
 * event type (see the event_names[...->rpe_type] uses in this file), so the
 * order of the entries must not change.
 */
const char *event_names[] = {
	"INVALID",
	"ADD_INSTANCE",
	"REMOVE_INSTANCE",
	"ENABLE",
	"DISABLE",
	"ADMIN_DEGRADED",
	"ADMIN_REFRESH",
	"ADMIN_RESTART",
	"ADMIN_MAINT_OFF",
	"ADMIN_MAINT_ON",
	"ADMIN_MAINT_ON_IMMEDIATE",
	"STOP",
	"START",
	"DEPENDENCY_CYCLE",
	"INVALID_DEPENDENCY",
	"ADMIN_DISABLE"
};
1546 
1547 /*
1548  * void *restarter_process_events()
1549  *
1550  *   Called in a separate thread to process the events on an instance's
1551  *   queue.  Empties the queue completely, and tries to keep the thread
1552  *   around for a little while after the queue is empty to save on
1553  *   startup costs.
1554  */
1555 static void *
1556 restarter_process_events(void *arg)
1557 {
1558 	scf_handle_t *h;
1559 	restarter_instance_qentry_t *event;
1560 	restarter_inst_t *rip;
1561 	char *fmri = (char *)arg;
1562 	struct timespec to;
1563 
1564 	assert(fmri != NULL);
1565 
1566 	h = libscf_handle_create_bound_loop();
1567 
1568 	/* grab the queue lock */
1569 	rip = inst_lookup_queue(fmri);
1570 	if (rip == NULL)
1571 		goto out;
1572 
1573 again:
1574 
1575 	while ((event = uu_list_first(rip->ri_queue)) != NULL) {
1576 		restarter_inst_t *inst;
1577 
1578 		/* drop the queue lock */
1579 		MUTEX_UNLOCK(&rip->ri_queue_lock);
1580 
1581 		/*
1582 		 * Grab the inst lock -- this waits until any outstanding
1583 		 * method finishes running.
1584 		 */
1585 		inst = inst_lookup_by_name(fmri);
1586 		if (inst == NULL) {
1587 			/* Getting deleted in the middle isn't an error. */
1588 			goto cont;
1589 		}
1590 
1591 		assert(instance_in_transition(inst) == 0);
1592 
1593 		/* process the event */
1594 		switch (event->riq_type) {
1595 		case RESTARTER_EVENT_TYPE_ENABLE:
1596 		case RESTARTER_EVENT_TYPE_DISABLE:
1597 		case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
1598 			(void) enable_inst(h, inst, event->riq_type);
1599 			break;
1600 
1601 		case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
1602 			restarter_delete_inst(inst);
1603 			inst = NULL;
1604 			goto cont;
1605 
1606 		case RESTARTER_EVENT_TYPE_STOP:
1607 			(void) stop_instance(h, inst, RSTOP_DEPENDENCY);
1608 			break;
1609 
1610 		case RESTARTER_EVENT_TYPE_START:
1611 			start_instance(h, inst);
1612 			break;
1613 
1614 		case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
1615 			maintain_instance(h, inst, 0, "dependency_cycle");
1616 			break;
1617 
1618 		case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
1619 			maintain_instance(h, inst, 0, "invalid_dependency");
1620 			break;
1621 
1622 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1623 			maintain_instance(h, inst, 0, "administrative_request");
1624 			break;
1625 
1626 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1627 			maintain_instance(h, inst, 1, "administrative_request");
1628 			break;
1629 
1630 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1631 			unmaintain_instance(h, inst, RUNMAINT_CLEAR);
1632 			break;
1633 
1634 		case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1635 			refresh_instance(h, inst);
1636 			break;
1637 
1638 		case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1639 			log_framework(LOG_WARNING, "Restarter: "
1640 			    "%s command (for %s) unimplemented.\n",
1641 			    event_names[event->riq_type], inst->ri_i.i_fmri);
1642 			break;
1643 
1644 		case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1645 			if (!instance_started(inst)) {
1646 				log_framework(LOG_DEBUG, "Restarter: "
1647 				    "Not restarting %s; not running.\n",
1648 				    inst->ri_i.i_fmri);
1649 			} else {
1650 				/*
1651 				 * Stop the instance.  If it can be restarted,
1652 				 * the graph engine will send a new event.
1653 				 */
1654 				(void) stop_instance(h, inst, RSTOP_RESTART);
1655 			}
1656 			break;
1657 
1658 		case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1659 		default:
1660 #ifndef NDEBUG
1661 			uu_warn("%s:%d: Bad restarter event %d.  "
1662 			    "Aborting.\n", __FILE__, __LINE__, event->riq_type);
1663 #endif
1664 			abort();
1665 		}
1666 
1667 		assert(inst != NULL);
1668 		MUTEX_UNLOCK(&inst->ri_lock);
1669 
1670 cont:
1671 		/* grab the queue lock */
1672 		rip = inst_lookup_queue(fmri);
1673 		if (rip == NULL)
1674 			goto out;
1675 
1676 		/* delete the event */
1677 		uu_list_remove(rip->ri_queue, event);
1678 		startd_free(event, sizeof (restarter_instance_qentry_t));
1679 	}
1680 
1681 	assert(rip != NULL);
1682 
1683 	/*
1684 	 * Try to preserve the thread for a little while for future use.
1685 	 */
1686 	to.tv_sec = 3;
1687 	to.tv_nsec = 0;
1688 	(void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
1689 	    &rip->ri_queue_lock, &to);
1690 
1691 	if (uu_list_first(rip->ri_queue) != NULL)
1692 		goto again;
1693 
1694 	rip->ri_queue_thread = 0;
1695 	MUTEX_UNLOCK(&rip->ri_queue_lock);
1696 out:
1697 	(void) scf_handle_unbind(h);
1698 	scf_handle_destroy(h);
1699 	free(fmri);
1700 	return (NULL);
1701 }
1702 
1703 static int
1704 is_admin_event(restarter_event_type_t t) {
1705 
1706 	switch (t) {
1707 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1708 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1709 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1710 	case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1711 	case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1712 	case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1713 		return (1);
1714 	default:
1715 		return (0);
1716 	}
1717 }
1718 
1719 static void
1720 restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
1721 {
1722 	restarter_instance_qentry_t *qe;
1723 	int r;
1724 
1725 	assert(PTHREAD_MUTEX_HELD(&ri->ri_queue_lock));
1726 	assert(!PTHREAD_MUTEX_HELD(&ri->ri_lock));
1727 
1728 	qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
1729 	qe->riq_type = e->rpe_type;
1730 
1731 	uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
1732 	r = uu_list_insert_before(ri->ri_queue, NULL, qe);
1733 	assert(r == 0);
1734 }
1735 
1736 /*
1737  * void *restarter_event_thread()
1738  *
1739  *  Handle incoming graph events by placing them on a per-instance
1740  *  queue.  We can't lock the main part of the instance structure, so
1741  *  just modify the seprarately locked event queue portion.
1742  */
1743 /*ARGSUSED*/
1744 static void *
1745 restarter_event_thread(void *unused)
1746 {
1747 	scf_handle_t *h;
1748 
1749 	/*
1750 	 * This is a new thread, and thus, gets its own handle
1751 	 * to the repository.
1752 	 */
1753 	h = libscf_handle_create_bound_loop();
1754 
1755 	MUTEX_LOCK(&ru->restarter_update_lock);
1756 
1757 	/*CONSTCOND*/
1758 	while (1) {
1759 		restarter_protocol_event_t *e;
1760 
1761 		while (ru->restarter_update_wakeup == 0)
1762 			(void) pthread_cond_wait(&ru->restarter_update_cv,
1763 			    &ru->restarter_update_lock);
1764 
1765 		ru->restarter_update_wakeup = 0;
1766 
1767 		while ((e = restarter_event_dequeue()) != NULL) {
1768 			restarter_inst_t *rip;
1769 			char *fmri;
1770 
1771 			MUTEX_UNLOCK(&ru->restarter_update_lock);
1772 
1773 			/*
1774 			 * ADD_INSTANCE is special: there's likely no
1775 			 * instance structure yet, so we need to handle the
1776 			 * addition synchronously.
1777 			 */
1778 			switch (e->rpe_type) {
1779 			case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1780 				if (restarter_insert_inst(h, e->rpe_inst) != 0)
1781 					log_error(LOG_INFO, "Restarter: "
1782 					    "Could not add %s.\n", e->rpe_inst);
1783 
1784 				MUTEX_LOCK(&st->st_load_lock);
1785 				if (--st->st_load_instances == 0)
1786 					(void) pthread_cond_broadcast(
1787 					    &st->st_load_cv);
1788 				MUTEX_UNLOCK(&st->st_load_lock);
1789 
1790 				goto nolookup;
1791 			}
1792 
1793 			/*
1794 			 * Lookup the instance, locking only the event queue.
1795 			 * Can't grab ri_lock here because it might be held
1796 			 * by a long-running method.
1797 			 */
1798 			rip = inst_lookup_queue(e->rpe_inst);
1799 			if (rip == NULL) {
1800 				log_error(LOG_INFO, "Restarter: "
1801 				    "Ignoring %s command for unknown service "
1802 				    "%s.\n", event_names[e->rpe_type],
1803 				    e->rpe_inst);
1804 				goto nolookup;
1805 			}
1806 
1807 			/* Keep ADMIN events from filling up the queue. */
1808 			if (is_admin_event(e->rpe_type) &&
1809 			    uu_list_numnodes(rip->ri_queue) >
1810 			    RINST_QUEUE_THRESHOLD) {
1811 				MUTEX_UNLOCK(&rip->ri_queue_lock);
1812 				log_instance(rip, B_TRUE, "Instance event "
1813 				    "queue overflow.  Dropping administrative "
1814 				    "request.");
1815 				log_framework(LOG_DEBUG, "%s: Instance event "
1816 				    "queue overflow.  Dropping administrative "
1817 				    "request.\n", rip->ri_i.i_fmri);
1818 				goto nolookup;
1819 			}
1820 
1821 			/* Now add the event to the instance queue. */
1822 			restarter_queue_event(rip, e);
1823 
1824 			if (rip->ri_queue_thread == 0) {
1825 				/*
1826 				 * Start a thread if one isn't already
1827 				 * running.
1828 				 */
1829 				fmri = safe_strdup(e->rpe_inst);
1830 				rip->ri_queue_thread =  startd_thread_create(
1831 				    restarter_process_events, (void *)fmri);
1832 			} else {
1833 				/*
1834 				 * Signal the existing thread that there's
1835 				 * a new event.
1836 				 */
1837 				(void) pthread_cond_broadcast(
1838 				    &rip->ri_queue_cv);
1839 			}
1840 
1841 			MUTEX_UNLOCK(&rip->ri_queue_lock);
1842 nolookup:
1843 			restarter_event_release(e);
1844 
1845 			MUTEX_LOCK(&ru->restarter_update_lock);
1846 		}
1847 	}
1848 
1849 	/*
1850 	 * Unreachable for now -- there's currently no graceful cleanup
1851 	 * called on exit().
1852 	 */
1853 	(void) scf_handle_unbind(h);
1854 	scf_handle_destroy(h);
1855 	return (NULL);
1856 }
1857 
1858 static restarter_inst_t *
1859 contract_to_inst(ctid_t ctid)
1860 {
1861 	restarter_inst_t *inst;
1862 	int id;
1863 
1864 	id = lookup_inst_by_contract(ctid);
1865 	if (id == -1)
1866 		return (NULL);
1867 
1868 	inst = inst_lookup_by_id(id);
1869 	if (inst != NULL) {
1870 		/*
1871 		 * Since ri_lock isn't held by the contract id lookup, this
1872 		 * instance may have been restarted and now be in a new
1873 		 * contract, making the old contract no longer valid for this
1874 		 * instance.
1875 		 */
1876 		if (ctid != inst->ri_i.i_primary_ctid) {
1877 			MUTEX_UNLOCK(&inst->ri_lock);
1878 			inst = NULL;
1879 		}
1880 	}
1881 	return (inst);
1882 }
1883 
1884 /*
1885  * void contract_action()
1886  *   Take action on contract events.
1887  */
1888 static void
1889 contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
1890     uint32_t type)
1891 {
1892 	const char *fmri = inst->ri_i.i_fmri;
1893 
1894 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1895 
1896 	/*
1897 	 * If startd has stopped this contract, there is no need to
1898 	 * stop it again.
1899 	 */
1900 	if (inst->ri_i.i_primary_ctid > 0 &&
1901 	    inst->ri_i.i_primary_ctid_stopped)
1902 		return;
1903 
1904 	if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
1905 	    | CT_PR_EV_HWERR)) == 0) {
1906 		/*
1907 		 * There shouldn't be other events, since that's not how we set
1908 		 * the terms. Thus, just log an error and drive on.
1909 		 */
1910 		log_framework(LOG_NOTICE,
1911 		    "%s: contract %ld received unexpected critical event "
1912 		    "(%d)\n", fmri, id, type);
1913 		return;
1914 	}
1915 
1916 	assert(instance_in_transition(inst) == 0);
1917 
1918 	if (instance_is_wait_style(inst)) {
1919 		/*
1920 		 * We ignore all events; if they impact the
1921 		 * process we're monitoring, then the
1922 		 * wait_thread will stop the instance.
1923 		 */
1924 		log_framework(LOG_DEBUG,
1925 		    "%s: ignoring contract event on wait-style service\n",
1926 		    fmri);
1927 	} else {
1928 		/*
1929 		 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
1930 		 */
1931 		switch (type) {
1932 		case CT_PR_EV_EMPTY:
1933 			(void) stop_instance(h, inst, RSTOP_EXIT);
1934 			break;
1935 		case CT_PR_EV_CORE:
1936 			(void) stop_instance(h, inst, RSTOP_CORE);
1937 			break;
1938 		case CT_PR_EV_SIGNAL:
1939 			(void) stop_instance(h, inst, RSTOP_SIGNAL);
1940 			break;
1941 		case CT_PR_EV_HWERR:
1942 			(void) stop_instance(h, inst, RSTOP_HWERR);
1943 			break;
1944 		}
1945 	}
1946 }
1947 
1948 /*
1949  * void *restarter_contract_event_thread(void *)
1950  *   Listens to the process contract bundle for critical events, taking action
1951  *   on events from contracts we know we are responsible for.
1952  */
1953 /*ARGSUSED*/
1954 static void *
1955 restarter_contracts_event_thread(void *unused)
1956 {
1957 	int fd, err;
1958 	scf_handle_t *local_handle;
1959 
1960 	/*
1961 	 * Await graph load completion.  That is, stop here, until we've scanned
1962 	 * the repository for contract - instance associations.
1963 	 */
1964 	MUTEX_LOCK(&st->st_load_lock);
1965 	while (!(st->st_load_complete && st->st_load_instances == 0))
1966 		(void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
1967 	MUTEX_UNLOCK(&st->st_load_lock);
1968 
1969 	/*
1970 	 * This is a new thread, and thus, gets its own handle
1971 	 * to the repository.
1972 	 */
1973 	if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
1974 		uu_die("Unable to bind a new repository handle: %s\n",
1975 		    scf_strerror(scf_error()));
1976 
1977 	fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
1978 	if (fd == -1)
1979 		uu_die("process bundle open failed");
1980 
1981 	/*
1982 	 * Make sure we get all events (including those generated by configd
1983 	 * before this thread was started).
1984 	 */
1985 	err = ct_event_reset(fd);
1986 	assert(err == 0);
1987 
1988 	for (;;) {
1989 		int efd, sfd;
1990 		ct_evthdl_t ev;
1991 		uint32_t type;
1992 		ctevid_t evid;
1993 		ct_stathdl_t status;
1994 		ctid_t ctid;
1995 		restarter_inst_t *inst;
1996 		uint64_t cookie;
1997 
1998 		if (err = ct_event_read_critical(fd, &ev)) {
1999 			log_error(LOG_WARNING,
2000 			    "Error reading next contract event: %s",
2001 			    strerror(err));
2002 			continue;
2003 		}
2004 
2005 		evid = ct_event_get_evid(ev);
2006 		ctid = ct_event_get_ctid(ev);
2007 		type = ct_event_get_type(ev);
2008 
2009 		/* Fetch cookie. */
2010 		if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
2011 		    < 0) {
2012 			ct_event_free(ev);
2013 			continue;
2014 		}
2015 
2016 		if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
2017 			log_framework(LOG_WARNING, "Could not get status for "
2018 			    "contract %ld: %s\n", ctid, strerror(err));
2019 
2020 			startd_close(sfd);
2021 			ct_event_free(ev);
2022 			continue;
2023 		}
2024 
2025 		cookie = ct_status_get_cookie(status);
2026 
2027 		log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
2028 		    "cookie %lld\n", type, ctid, cookie);
2029 
2030 		ct_status_free(status);
2031 
2032 		startd_close(sfd);
2033 
2034 		/*
2035 		 * svc.configd(1M) restart handling performed by the
2036 		 * fork_configd_thread.  We don't acknowledge, as that thread
2037 		 * will do so.
2038 		 */
2039 		if (cookie == CONFIGD_COOKIE) {
2040 			ct_event_free(ev);
2041 			continue;
2042 		}
2043 
2044 		inst = NULL;
2045 		if (storing_contract != 0 &&
2046 		    (inst = contract_to_inst(ctid)) == NULL) {
2047 			/*
2048 			 * This can happen for two reasons:
2049 			 * - method_run() has not yet stored the
2050 			 *    the contract into the internal hash table.
2051 			 * - we receive an EMPTY event for an abandoned
2052 			 *    contract.
2053 			 * If there is any contract in the process of
2054 			 * being stored into the hash table then re-read
2055 			 * the event later.
2056 			 */
2057 			log_framework(LOG_DEBUG,
2058 			    "Reset event %d for unknown "
2059 			    "contract id %ld\n", type, ctid);
2060 
2061 			/* don't go too fast */
2062 			(void) poll(NULL, 0, 100);
2063 
2064 			(void) ct_event_reset(fd);
2065 			ct_event_free(ev);
2066 			continue;
2067 		}
2068 
2069 		/*
2070 		 * Do not call contract_to_inst() again if first
2071 		 * call succeeded.
2072 		 */
2073 		if (inst == NULL)
2074 			inst = contract_to_inst(ctid);
2075 		if (inst == NULL) {
2076 			/*
2077 			 * This can happen if we receive an EMPTY
2078 			 * event for an abandoned contract.
2079 			 */
2080 			log_framework(LOG_DEBUG,
2081 			    "Received event %d for unknown contract id "
2082 			    "%ld\n", type, ctid);
2083 		} else {
2084 			log_framework(LOG_DEBUG,
2085 			    "Received event %d for contract id "
2086 			    "%ld (%s)\n", type, ctid,
2087 			    inst->ri_i.i_fmri);
2088 
2089 			contract_action(local_handle, inst, ctid, type);
2090 
2091 			MUTEX_UNLOCK(&inst->ri_lock);
2092 		}
2093 
2094 		efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
2095 		    O_WRONLY);
2096 		if (efd != -1) {
2097 			(void) ct_ctl_ack(efd, evid);
2098 			startd_close(efd);
2099 		}
2100 
2101 		ct_event_free(ev);
2102 
2103 	}
2104 
2105 	/*NOTREACHED*/
2106 	return (NULL);
2107 }
2108 
2109 /*
2110  * Timeout queue, processed by restarter_timeouts_event_thread().
2111  */
2112 timeout_queue_t *timeouts;
2113 static uu_list_pool_t *timeout_pool;
2114 
2115 typedef struct timeout_update {
2116 	pthread_mutex_t		tu_lock;
2117 	pthread_cond_t		tu_cv;
2118 	int			tu_wakeup;
2119 } timeout_update_t;
2120 
2121 timeout_update_t *tu;
2122 
/*
 * NULL-terminated list of instance FMRIs for which is_timeout_ovr() reports
 * a timeout override.  The table is never modified, so both the strings and
 * the pointers to them are const.
 */
static const char *const timeout_ovr_svcs[] = {
	"svc:/system/manifest-import:default",
	"svc:/network/initial:default",
	"svc:/network/service:default",
	"svc:/system/rmtmpfiles:default",
	"svc:/network/loopback:default",
	"svc:/network/physical:default",
	"svc:/system/device/local:default",
	"svc:/system/metainit:default",
	"svc:/system/filesystem/usr:default",
	"svc:/system/filesystem/minimal:default",
	"svc:/system/filesystem/local:default",
	NULL
};
2137 
2138 int
2139 is_timeout_ovr(restarter_inst_t *inst)
2140 {
2141 	int i;
2142 
2143 	for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
2144 		if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
2145 			log_instance(inst, B_TRUE, "Timeout override by "
2146 			    "svc.startd.  Using infinite timeout.");
2147 			return (1);
2148 		}
2149 	}
2150 
2151 	return (0);
2152 }
2153 
2154 /*ARGSUSED*/
2155 static int
2156 timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
2157 {
2158 	hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
2159 	hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;
2160 
2161 	if (t1 > t2)
2162 		return (1);
2163 	else if (t1 < t2)
2164 		return (-1);
2165 	return (0);
2166 }
2167 
2168 void
2169 timeout_init()
2170 {
2171 	timeouts = startd_zalloc(sizeof (timeout_queue_t));
2172 
2173 	(void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);
2174 
2175 	timeout_pool = startd_list_pool_create("timeouts",
2176 	    sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
2177 	    timeout_compare, UU_LIST_POOL_DEBUG);
2178 	assert(timeout_pool != NULL);
2179 
2180 	timeouts->tq_list = startd_list_create(timeout_pool,
2181 	    timeouts, UU_LIST_SORTED);
2182 	assert(timeouts->tq_list != NULL);
2183 
2184 	tu = startd_zalloc(sizeof (timeout_update_t));
2185 	(void) pthread_cond_init(&tu->tu_cv, NULL);
2186 	(void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
2187 }
2188 
2189 void
2190 timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
2191 {
2192 	hrtime_t now, timeout;
2193 	timeout_entry_t *entry;
2194 	uu_list_index_t idx;
2195 
2196 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
2197 
2198 	now = gethrtime();
2199 
2200 	/*
2201 	 * If we overflow LLONG_MAX, we're never timing out anyways, so
2202 	 * just return.
2203 	 */
2204 	if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
2205 		log_instance(inst, B_TRUE, "timeout_seconds too large, "
2206 		    "treating as infinite.");
2207 		return;
2208 	}
2209 
2210 	/* hrtime is in nanoseconds. Convert timeout_sec. */
2211 	timeout = now + (timeout_sec * 1000000000LL);
2212 
2213 	entry = startd_alloc(sizeof (timeout_entry_t));
2214 	entry->te_timeout = timeout;
2215 	entry->te_ctid = cid;
2216 	entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
2217 	entry->te_logstem = safe_strdup(inst->ri_logstem);
2218 	entry->te_fired = 0;
2219 	/* Insert the calculated timeout time onto the queue. */
2220 	MUTEX_LOCK(&timeouts->tq_lock);
2221 	(void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
2222 	uu_list_node_init(entry, &entry->te_link, timeout_pool);
2223 	uu_list_insert(timeouts->tq_list, entry, idx);
2224 	MUTEX_UNLOCK(&timeouts->tq_lock);
2225 
2226 	assert(inst->ri_timeout == NULL);
2227 	inst->ri_timeout = entry;
2228 
2229 	MUTEX_LOCK(&tu->tu_lock);
2230 	tu->tu_wakeup = 1;
2231 	(void) pthread_cond_broadcast(&tu->tu_cv);
2232 	MUTEX_UNLOCK(&tu->tu_lock);
2233 }
2234 
2235 
2236 void
2237 timeout_remove(restarter_inst_t *inst, ctid_t cid)
2238 {
2239 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
2240 
2241 	if (inst->ri_timeout == NULL)
2242 		return;
2243 
2244 	assert(inst->ri_timeout->te_ctid == cid);
2245 
2246 	MUTEX_LOCK(&timeouts->tq_lock);
2247 	uu_list_remove(timeouts->tq_list, inst->ri_timeout);
2248 	MUTEX_UNLOCK(&timeouts->tq_lock);
2249 
2250 	free(inst->ri_timeout->te_fmri);
2251 	free(inst->ri_timeout->te_logstem);
2252 	startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
2253 	inst->ri_timeout = NULL;
2254 }
2255 
2256 static int
2257 timeout_now()
2258 {
2259 	timeout_entry_t *e;
2260 	hrtime_t now;
2261 	int ret;
2262 
2263 	now = gethrtime();
2264 
2265 	/*
2266 	 * Walk through the (sorted) timeouts list.  While the timeout
2267 	 * at the head of the list is <= the current time, kill the
2268 	 * method.
2269 	 */
2270 	MUTEX_LOCK(&timeouts->tq_lock);
2271 
2272 	for (e = uu_list_first(timeouts->tq_list);
2273 	    e != NULL && e->te_timeout <= now;
2274 	    e = uu_list_next(timeouts->tq_list, e)) {
2275 		log_framework(LOG_WARNING, "%s: Method or service exit timed "
2276 		    "out.  Killing contract %ld.\n", e->te_fmri, e->te_ctid);
2277 		log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
2278 		    "Method or service exit timed out.  Killing contract %ld.",
2279 		    e->te_ctid);
2280 		e->te_fired = 1;
2281 		(void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
2282 	}
2283 
2284 	if (uu_list_numnodes(timeouts->tq_list) > 0)
2285 		ret = 0;
2286 	else
2287 		ret = -1;
2288 
2289 	MUTEX_UNLOCK(&timeouts->tq_lock);
2290 
2291 	return (ret);
2292 }
2293 
2294 /*
2295  * void *restarter_timeouts_event_thread(void *)
2296  *   Responsible for monitoring the method timeouts.  This thread must
2297  *   be started before any methods are called.
2298  */
2299 /*ARGSUSED*/
2300 static void *
2301 restarter_timeouts_event_thread(void *unused)
2302 {
2303 	/*
2304 	 * Timeouts are entered on a priority queue, which is processed by
2305 	 * this thread.  As timeouts are specified in seconds, we'll do
2306 	 * the necessary processing every second, as long as the queue
2307 	 * is not empty.
2308 	 */
2309 
2310 	/*CONSTCOND*/
2311 	while (1) {
2312 		/*
2313 		 * As long as the timeout list isn't empty, process it
2314 		 * every second.
2315 		 */
2316 		if (timeout_now() == 0) {
2317 			(void) sleep(1);
2318 			continue;
2319 		}
2320 
2321 		/* The list is empty, wait until we have more timeouts. */
2322 		MUTEX_LOCK(&tu->tu_lock);
2323 
2324 		while (tu->tu_wakeup == 0)
2325 			(void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);
2326 
2327 		tu->tu_wakeup = 0;
2328 		MUTEX_UNLOCK(&tu->tu_lock);
2329 	}
2330 
2331 	return (NULL);
2332 }
2333 
2334 void
2335 restarter_start()
2336 {
2337 	(void) startd_thread_create(restarter_timeouts_event_thread, NULL);
2338 	(void) startd_thread_create(restarter_event_thread, NULL);
2339 	(void) startd_thread_create(restarter_contracts_event_thread, NULL);
2340 	(void) startd_thread_create(wait_thread, NULL);
2341 }
2342 
2343 
2344 void
2345 restarter_init()
2346 {
2347 	restarter_instance_pool = startd_list_pool_create("restarter_instances",
2348 	    sizeof (restarter_inst_t), offsetof(restarter_inst_t,
2349 	    ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
2350 	(void) memset(&instance_list, 0, sizeof (instance_list));
2351 
2352 	(void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
2353 	instance_list.ril_instance_list = startd_list_create(
2354 	    restarter_instance_pool, &instance_list, UU_LIST_SORTED);
2355 
2356 	restarter_queue_pool = startd_list_pool_create(
2357 	    "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
2358 	    offsetof(restarter_instance_qentry_t,  riq_link), NULL,
2359 	    UU_LIST_POOL_DEBUG);
2360 
2361 	contract_list_pool = startd_list_pool_create(
2362 	    "contract_list", sizeof (contract_entry_t),
2363 	    offsetof(contract_entry_t,  ce_link), NULL,
2364 	    UU_LIST_POOL_DEBUG);
2365 	contract_hash_init();
2366 
2367 	log_framework(LOG_DEBUG, "Initialized restarter\n");
2368 }
2369