xref: /titanic_50/usr/src/cmd/svc/startd/restarter.c (revision 62a24de03df1f2399ceda704cb3874dabc98bbbd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * restarter.c - service manipulation
30  *
31  * This component manages services whose restarter is svc.startd, the standard
32  * restarter.  It translates restarter protocol events from the graph engine
33  * into actions on processes, as a delegated restarter would do.
34  *
35  * The master restarter manages a number of always-running threads:
36  *   - restarter event thread: events from the graph engine
37  *   - timeout thread: thread to fire queued timeouts
38  *   - contract thread: thread to handle contract events
39  *   - wait thread: thread to handle wait-based services
40  *
41  * The other threads are created as-needed:
42  *   - per-instance method threads
43  *   - per-instance event processing threads
44  *
45  * The interaction of all threads must result in the following conditions
46  * being satisfied (on a per-instance basis):
47  *   - restarter events must be processed in order
48  *   - method execution must be serialized
49  *   - instance delete must be held until outstanding methods are complete
50  *   - contract events shouldn't be processed while a method is running
51  *   - timeouts should fire even when a method is running
52  *
53  * Service instances are represented by restarter_inst_t's and are kept in the
54  * instance_list list.
55  *
56  * Service States
57  *   The current state of a service instance is kept in
58  *   restarter_inst_t->ri_i.i_state.  If transition to a new state could take
59  *   some time, then before we effect the transition we set
60  *   restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
61  *   rotate i_next_state to i_state and set i_next_state to
62  *   RESTARTER_STATE_NONE.  So usually i_next_state is _NONE when ri_lock is not
63  *   held.  The exception is when we launch methods, which are done with
64  *   a separate thread.  To keep any other threads from grabbing ri_lock before
65  *   method_thread() does, we set ri_method_thread to the thread id of the
66  *   method thread, and when it is nonzero any thread with a different thread id
67  *   waits on ri_method_cv.
68  *
69  * Method execution is serialized by blocking on ri_method_cv in
70  * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread.  This
71  * also prevents the instance structure from being deleted until all
72  * outstanding operations such as method_thread() have finished.
73  *
74  * Lock ordering:
75  *
76  * dgraph_lock [can be held when taking:]
77  *   utmpx_lock
78  *   dictionary->dict_lock
79  *   st->st_load_lock
80  *   wait_info_lock
81  *   ru->restarter_update_lock
82  *     restarter_queue->rpeq_lock
83  *   instance_list.ril_lock
84  *     inst->ri_lock
85  *   st->st_configd_live_lock
86  *
87  * instance_list.ril_lock
88  *   graph_queue->gpeq_lock
89  *   gu->gu_lock
90  *   st->st_configd_live_lock
91  *   dictionary->dict_lock
92  *   inst->ri_lock
93  *     graph_queue->gpeq_lock
94  *     gu->gu_lock
95  *     tu->tu_lock
96  *     tq->tq_lock
97  *     inst->ri_queue_lock
98  *       wait_info_lock
99  *       bp->cb_lock
100  *     utmpx_lock
101  *
102  * single_user_thread_lock
103  *   wait_info_lock
104  *   utmpx_lock
105  *
106  * gu_freeze_lock
107  *
108  * logbuf_mutex nests inside pretty much everything.
109  */
110 
111 #include <sys/contract/process.h>
112 #include <sys/ctfs.h>
113 #include <sys/stat.h>
114 #include <sys/time.h>
115 #include <sys/types.h>
116 #include <sys/uio.h>
117 #include <sys/wait.h>
118 #include <assert.h>
119 #include <errno.h>
120 #include <fcntl.h>
121 #include <libcontract.h>
122 #include <libcontract_priv.h>
123 #include <libintl.h>
124 #include <librestart.h>
125 #include <librestart_priv.h>
126 #include <libuutil.h>
127 #include <limits.h>
128 #include <poll.h>
129 #include <port.h>
130 #include <pthread.h>
131 #include <stdarg.h>
132 #include <stdio.h>
133 #include <strings.h>
134 #include <unistd.h>
135 
136 #include "startd.h"
137 #include "protocol.h"
138 
139 static uu_list_pool_t *restarter_instance_pool;
140 static restarter_instance_list_t instance_list;
141 
142 static uu_list_pool_t *restarter_queue_pool;
143 
144 /*ARGSUSED*/
145 static int
146 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
147     void *private)
148 {
149 	int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
150 	int rc_id = *(int *)rc_arg;
151 
152 	if (lc_id > rc_id)
153 		return (1);
154 	if (lc_id < rc_id)
155 		return (-1);
156 	return (0);
157 }
158 
159 static restarter_inst_t *
160 inst_lookup_by_name(const char *name)
161 {
162 	int id;
163 
164 	id = dict_lookup_byname(name);
165 	if (id == -1)
166 		return (NULL);
167 
168 	return (inst_lookup_by_id(id));
169 }
170 
171 restarter_inst_t *
172 inst_lookup_by_id(int id)
173 {
174 	restarter_inst_t *inst;
175 
176 	MUTEX_LOCK(&instance_list.ril_lock);
177 	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
178 	if (inst != NULL)
179 		MUTEX_LOCK(&inst->ri_lock);
180 	MUTEX_UNLOCK(&instance_list.ril_lock);
181 
182 	if (inst != NULL) {
183 		while (inst->ri_method_thread != 0 &&
184 		    !pthread_equal(inst->ri_method_thread, pthread_self())) {
185 			++inst->ri_method_waiters;
186 			(void) pthread_cond_wait(&inst->ri_method_cv,
187 			    &inst->ri_lock);
188 			assert(inst->ri_method_waiters > 0);
189 			--inst->ri_method_waiters;
190 		}
191 	}
192 
193 	return (inst);
194 }
195 
196 static restarter_inst_t *
197 inst_lookup_queue(const char *name)
198 {
199 	int id;
200 	restarter_inst_t *inst;
201 
202 	id = dict_lookup_byname(name);
203 	if (id == -1)
204 		return (NULL);
205 
206 	MUTEX_LOCK(&instance_list.ril_lock);
207 	inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
208 	if (inst != NULL)
209 		MUTEX_LOCK(&inst->ri_queue_lock);
210 	MUTEX_UNLOCK(&instance_list.ril_lock);
211 
212 	return (inst);
213 }
214 
215 const char *
216 service_style(int flags)
217 {
218 	switch (flags & RINST_STYLE_MASK) {
219 	case RINST_CONTRACT:	return ("contract");
220 	case RINST_TRANSIENT:	return ("transient");
221 	case RINST_WAIT:	return ("wait");
222 
223 	default:
224 #ifndef NDEBUG
225 		uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
226 #endif
227 		abort();
228 		/* NOTREACHED */
229 	}
230 }
231 
232 /*
233  * Fails with ECONNABORTED or ECANCELED.
234  */
235 static int
236 check_contract(restarter_inst_t *inst, boolean_t primary,
237     scf_instance_t *scf_inst)
238 {
239 	ctid_t *ctidp;
240 	int fd, r;
241 
242 	ctidp = primary ? &inst->ri_i.i_primary_ctid :
243 	    &inst->ri_i.i_transient_ctid;
244 
245 	assert(*ctidp >= 1);
246 
247 	fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
248 	if (fd >= 0) {
249 		r = close(fd);
250 		assert(r == 0);
251 		return (0);
252 	}
253 
254 	r = restarter_remove_contract(scf_inst, *ctidp, primary ?
255 	    RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
256 	switch (r) {
257 	case 0:
258 	case ECONNABORTED:
259 	case ECANCELED:
260 		*ctidp = 0;
261 		return (r);
262 
263 	case ENOMEM:
264 		uu_die("Out of memory\n");
265 		/* NOTREACHED */
266 
267 	case EPERM:
268 		uu_die("Insufficient privilege.\n");
269 		/* NOTREACHED */
270 
271 	case EACCES:
272 		uu_die("Repository backend access denied.\n");
273 		/* NOTREACHED */
274 
275 	case EROFS:
276 		log_error(LOG_INFO, "Could not remove unusable contract id %ld "
277 		    "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
278 		return (0);
279 
280 	case EINVAL:
281 	case EBADF:
282 	default:
283 		assert(0);
284 		abort();
285 		/* NOTREACHED */
286 	}
287 }
288 
289 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);
290 
291 /*
292  * int restarter_insert_inst(scf_handle_t *, char *)
293  *   If the inst is already in the restarter list, return its id.  If the inst
294  *   is not in the restarter list, initialize a restarter_inst_t, initialize its
295  *   states, insert it into the list, and return 0.
296  *
297  *   Fails with
298  *     ENOENT - name is not in the repository
299  */
300 static int
301 restarter_insert_inst(scf_handle_t *h, const char *name)
302 {
303 	int id, r;
304 	restarter_inst_t *inst;
305 	uu_list_index_t idx;
306 	scf_service_t *scf_svc;
307 	scf_instance_t *scf_inst;
308 	scf_snapshot_t *snap = NULL;
309 	scf_propertygroup_t *pg;
310 	char *svc_name, *inst_name;
311 	char logfilebuf[PATH_MAX];
312 	char *c;
313 	boolean_t do_commit_states;
314 	restarter_instance_state_t state, next_state;
315 	protocol_states_t *ps;
316 	pid_t start_pid;
317 
318 	MUTEX_LOCK(&instance_list.ril_lock);
319 
320 	/*
321 	 * We don't use inst_lookup_by_name() here because we want the lookup
322 	 * & insert to be atomic.
323 	 */
324 	id = dict_lookup_byname(name);
325 	if (id != -1) {
326 		inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
327 		    &idx);
328 		if (inst != NULL) {
329 			MUTEX_UNLOCK(&instance_list.ril_lock);
330 			return (0);
331 		}
332 	}
333 
334 	/* Allocate an instance */
335 	inst = startd_zalloc(sizeof (restarter_inst_t));
336 	inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
337 	inst->ri_utmpx_prefix[0] = '\0';
338 
339 	inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
340 	(void) strcpy((char *)inst->ri_i.i_fmri, name);
341 
342 	inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);
343 
344 	/*
345 	 * id shouldn't be -1 since we use the same dictionary as graph.c, but
346 	 * just in case.
347 	 */
348 	inst->ri_id = (id != -1 ? id : dict_insert(name));
349 
350 	special_online_hooks_get(name, &inst->ri_pre_online_hook,
351 	    &inst->ri_post_online_hook, &inst->ri_post_offline_hook);
352 
353 	scf_svc = safe_scf_service_create(h);
354 	scf_inst = safe_scf_instance_create(h);
355 	pg = safe_scf_pg_create(h);
356 	svc_name = startd_alloc(max_scf_name_size);
357 	inst_name = startd_alloc(max_scf_name_size);
358 
359 rep_retry:
360 	if (snap != NULL)
361 		scf_snapshot_destroy(snap);
362 	if (inst->ri_logstem != NULL)
363 		startd_free(inst->ri_logstem, PATH_MAX);
364 	if (inst->ri_common_name != NULL)
365 		startd_free(inst->ri_common_name, max_scf_value_size);
366 	if (inst->ri_C_common_name != NULL)
367 		startd_free(inst->ri_C_common_name, max_scf_value_size);
368 	snap = NULL;
369 	inst->ri_logstem = NULL;
370 	inst->ri_common_name = NULL;
371 	inst->ri_C_common_name = NULL;
372 
373 	if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
374 	    NULL, SCF_DECODE_FMRI_EXACT) != 0) {
375 		switch (scf_error()) {
376 		case SCF_ERROR_CONNECTION_BROKEN:
377 			libscf_handle_rebind(h);
378 			goto rep_retry;
379 
380 		case SCF_ERROR_NOT_FOUND:
381 			goto deleted;
382 		}
383 
384 		uu_die("Can't decode FMRI %s: %s\n", name,
385 		    scf_strerror(scf_error()));
386 	}
387 
388 	/*
389 	 * If there's no running snapshot, then we execute using the editing
390 	 * snapshot.  Pending snapshots will be taken later.
391 	 */
392 	snap = libscf_get_running_snapshot(scf_inst);
393 
394 	if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
395 	    (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
396 	    0)) {
397 		switch (scf_error()) {
398 		case SCF_ERROR_NOT_SET:
399 			break;
400 
401 		case SCF_ERROR_CONNECTION_BROKEN:
402 			libscf_handle_rebind(h);
403 			goto rep_retry;
404 
405 		default:
406 			assert(0);
407 			abort();
408 		}
409 
410 		goto deleted;
411 	}
412 
413 	(void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
414 	for (c = logfilebuf; *c != '\0'; c++)
415 		if (*c == '/')
416 			*c = '-';
417 
418 	inst->ri_logstem = startd_alloc(PATH_MAX);
419 	(void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
420 	    LOG_SUFFIX);
421 
422 	/*
423 	 * If the restarter group is missing, use uninit/none.  Otherwise,
424 	 * we're probably being restarted & don't want to mess up the states
425 	 * that are there.
426 	 */
427 	state = RESTARTER_STATE_UNINIT;
428 	next_state = RESTARTER_STATE_NONE;
429 
430 	r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
431 	if (r != 0) {
432 		switch (scf_error()) {
433 		case SCF_ERROR_CONNECTION_BROKEN:
434 			libscf_handle_rebind(h);
435 			goto rep_retry;
436 
437 		case SCF_ERROR_NOT_SET:
438 			goto deleted;
439 
440 		case SCF_ERROR_NOT_FOUND:
441 			/*
442 			 * This shouldn't happen since the graph engine should
443 			 * have initialized the state to uninitialized/none if
444 			 * there was no restarter pg.  In case somebody
445 			 * deleted it, though....
446 			 */
447 			do_commit_states = B_TRUE;
448 			break;
449 
450 		default:
451 			assert(0);
452 			abort();
453 		}
454 	} else {
455 		r = libscf_read_states(pg, &state, &next_state);
456 		if (r != 0) {
457 			do_commit_states = B_TRUE;
458 		} else {
459 			if (next_state != RESTARTER_STATE_NONE) {
460 				/*
461 				 * Force next_state to _NONE since we
462 				 * don't look for method processes.
463 				 */
464 				next_state = RESTARTER_STATE_NONE;
465 				do_commit_states = B_TRUE;
466 			} else {
467 				/*
468 				 * Inform the restarter of our state without
469 				 * changing the STIME in the repository.
470 				 */
471 				ps = startd_alloc(sizeof (*ps));
472 				inst->ri_i.i_state = ps->ps_state = state;
473 				inst->ri_i.i_next_state = ps->ps_state_next =
474 				    next_state;
475 
476 				graph_protocol_send_event(inst->ri_i.i_fmri,
477 				    GRAPH_UPDATE_STATE_CHANGE, ps);
478 
479 				do_commit_states = B_FALSE;
480 			}
481 		}
482 	}
483 
484 	switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
485 	    &inst->ri_utmpx_prefix)) {
486 	case 0:
487 		break;
488 
489 	case ECONNABORTED:
490 		libscf_handle_rebind(h);
491 		goto rep_retry;
492 
493 	case ECANCELED:
494 		goto deleted;
495 
496 	case ENOENT:
497 		/*
498 		 * This is odd, because the graph engine should have required
499 		 * the general property group.  So we'll just use default
500 		 * flags in anticipation of the graph engine sending us
501 		 * REMOVE_INSTANCE when it finds out that the general property
502 		 * group has been deleted.
503 		 */
504 		inst->ri_flags = RINST_CONTRACT;
505 		break;
506 
507 	default:
508 		assert(0);
509 		abort();
510 	}
511 
512 	switch (libscf_get_template_values(scf_inst, snap,
513 	    &inst->ri_common_name, &inst->ri_C_common_name)) {
514 	case 0:
515 		break;
516 
517 	case ECONNABORTED:
518 		libscf_handle_rebind(h);
519 		goto rep_retry;
520 
521 	case ECANCELED:
522 		goto deleted;
523 
524 	case ECHILD:
525 	case ENOENT:
526 		break;
527 
528 	default:
529 		assert(0);
530 		abort();
531 	}
532 
533 	switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
534 	    &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
535 	    &start_pid)) {
536 	case 0:
537 		break;
538 
539 	case ECONNABORTED:
540 		libscf_handle_rebind(h);
541 		goto rep_retry;
542 
543 	case ECANCELED:
544 		goto deleted;
545 
546 	default:
547 		assert(0);
548 		abort();
549 	}
550 
551 	if (inst->ri_i.i_primary_ctid >= 1) {
552 		contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);
553 
554 		switch (check_contract(inst, B_TRUE, scf_inst)) {
555 		case 0:
556 			break;
557 
558 		case ECONNABORTED:
559 			libscf_handle_rebind(h);
560 			goto rep_retry;
561 
562 		case ECANCELED:
563 			goto deleted;
564 
565 		default:
566 			assert(0);
567 			abort();
568 		}
569 	}
570 
571 	if (inst->ri_i.i_transient_ctid >= 1) {
572 		switch (check_contract(inst, B_FALSE, scf_inst)) {
573 		case 0:
574 			break;
575 
576 		case ECONNABORTED:
577 			libscf_handle_rebind(h);
578 			goto rep_retry;
579 
580 		case ECANCELED:
581 			goto deleted;
582 
583 		default:
584 			assert(0);
585 			abort();
586 		}
587 	}
588 
589 	/* No more failures we live through, so add it to the list. */
590 	(void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
591 	(void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
592 	MUTEX_LOCK(&inst->ri_lock);
593 	MUTEX_LOCK(&inst->ri_queue_lock);
594 
595 	(void) pthread_cond_init(&inst->ri_method_cv, NULL);
596 
597 	uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
598 	uu_list_insert(instance_list.ril_instance_list, inst, idx);
599 	MUTEX_UNLOCK(&instance_list.ril_lock);
600 
601 	if (start_pid != -1 &&
602 	    (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
603 		int ret;
604 		ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
605 		if (ret == -1) {
606 			/*
607 			 * Implication:  if we can't reregister the
608 			 * instance, we will start another one.  Two
609 			 * instances may or may not result in a resource
610 			 * conflict.
611 			 */
612 			log_error(LOG_WARNING,
613 			    "%s: couldn't reregister %ld for wait\n",
614 			    inst->ri_i.i_fmri, start_pid);
615 		} else if (ret == 1) {
616 			/*
617 			 * Leading PID has exited.
618 			 */
619 			(void) stop_instance(h, inst, RSTOP_EXIT);
620 		}
621 	}
622 
623 
624 	scf_pg_destroy(pg);
625 
626 	if (do_commit_states)
627 		(void) restarter_instance_update_states(h, inst, state,
628 		    next_state, RERR_NONE, NULL);
629 
630 	log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
631 	    service_style(inst->ri_flags));
632 
633 	MUTEX_UNLOCK(&inst->ri_queue_lock);
634 	MUTEX_UNLOCK(&inst->ri_lock);
635 
636 	startd_free(svc_name, max_scf_name_size);
637 	startd_free(inst_name, max_scf_name_size);
638 	scf_snapshot_destroy(snap);
639 	scf_instance_destroy(scf_inst);
640 	scf_service_destroy(scf_svc);
641 
642 	log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
643 	    name);
644 
645 	return (0);
646 
647 deleted:
648 	MUTEX_UNLOCK(&instance_list.ril_lock);
649 	startd_free(inst_name, max_scf_name_size);
650 	startd_free(svc_name, max_scf_name_size);
651 	if (snap != NULL)
652 		scf_snapshot_destroy(snap);
653 	scf_pg_destroy(pg);
654 	scf_instance_destroy(scf_inst);
655 	scf_service_destroy(scf_svc);
656 	startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
657 	uu_list_destroy(inst->ri_queue);
658 	if (inst->ri_logstem != NULL)
659 		startd_free(inst->ri_logstem, PATH_MAX);
660 	if (inst->ri_common_name != NULL)
661 		startd_free(inst->ri_common_name, max_scf_value_size);
662 	if (inst->ri_C_common_name != NULL)
663 		startd_free(inst->ri_C_common_name, max_scf_value_size);
664 	startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
665 	startd_free(inst, sizeof (restarter_inst_t));
666 	return (ENOENT);
667 }
668 
669 static void
670 restarter_delete_inst(restarter_inst_t *ri)
671 {
672 	int id;
673 	restarter_inst_t *rip;
674 	void *cookie = NULL;
675 	restarter_instance_qentry_t *e;
676 
677 	assert(PTHREAD_MUTEX_HELD(&ri->ri_lock));
678 
679 	/*
680 	 * Must drop the instance lock so we can pick up the instance_list
681 	 * lock & remove the instance.
682 	 */
683 	id = ri->ri_id;
684 	MUTEX_UNLOCK(&ri->ri_lock);
685 
686 	MUTEX_LOCK(&instance_list.ril_lock);
687 
688 	rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
689 	if (rip == NULL) {
690 		MUTEX_UNLOCK(&instance_list.ril_lock);
691 		return;
692 	}
693 
694 	assert(ri == rip);
695 
696 	uu_list_remove(instance_list.ril_instance_list, ri);
697 
698 	log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
699 	    ri->ri_i.i_fmri);
700 
701 	MUTEX_UNLOCK(&instance_list.ril_lock);
702 
703 	/*
704 	 * We can lock the instance without holding the instance_list lock
705 	 * since we removed the instance from the list.
706 	 */
707 	MUTEX_LOCK(&ri->ri_lock);
708 	MUTEX_LOCK(&ri->ri_queue_lock);
709 
710 	if (ri->ri_i.i_primary_ctid >= 1)
711 		contract_hash_remove(ri->ri_i.i_primary_ctid);
712 
713 	while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
714 		(void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);
715 
716 	while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
717 		startd_free(e, sizeof (*e));
718 	uu_list_destroy(ri->ri_queue);
719 
720 	startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
721 	startd_free(ri->ri_logstem, PATH_MAX);
722 	if (ri->ri_common_name != NULL)
723 		startd_free(ri->ri_common_name, max_scf_value_size);
724 	if (ri->ri_C_common_name != NULL)
725 		startd_free(ri->ri_C_common_name, max_scf_value_size);
726 	startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
727 	(void) pthread_mutex_destroy(&ri->ri_lock);
728 	(void) pthread_mutex_destroy(&ri->ri_queue_lock);
729 	startd_free(ri, sizeof (restarter_inst_t));
730 }
731 
732 /*
733  * instance_is_wait_style()
734  *
735  *   Returns 1 if the given instance is a "wait-style" service instance.
736  */
737 int
738 instance_is_wait_style(restarter_inst_t *inst)
739 {
740 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
741 	return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
742 }
743 
744 /*
745  * instance_is_transient_style()
746  *
747  *   Returns 1 if the given instance is a transient service instance.
748  */
749 int
750 instance_is_transient_style(restarter_inst_t *inst)
751 {
752 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
753 	return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
754 }
755 
756 /*
757  * instance_in_transition()
758  * Returns 1 if instance is in transition, 0 if not
759  */
760 int
761 instance_in_transition(restarter_inst_t *inst)
762 {
763 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
764 	if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
765 		return (0);
766 	return (1);
767 }
768 
769 /*
770  * returns 1 if instance is already started, 0 if not
771  */
772 static int
773 instance_started(restarter_inst_t *inst)
774 {
775 	int ret;
776 
777 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
778 
779 	if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
780 	    inst->ri_i.i_state == RESTARTER_STATE_DEGRADED)
781 		ret = 1;
782 	else
783 		ret = 0;
784 
785 	return (ret);
786 }
787 
788 /*
789  * Returns
790  *   0 - success
791  *   ECONNRESET - success, but h was rebound
792  */
793 int
794 restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
795     restarter_instance_state_t new_state,
796     restarter_instance_state_t new_state_next, restarter_error_t err, char *aux)
797 {
798 	protocol_states_t *states;
799 	int e;
800 	uint_t retry_count = 0, msecs = ALLOC_DELAY;
801 	boolean_t rebound = B_FALSE;
802 	int prev_state_online;
803 	int state_online;
804 
805 	assert(PTHREAD_MUTEX_HELD(&ri->ri_lock));
806 
807 	prev_state_online = instance_started(ri);
808 
809 retry:
810 	e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
811 	    aux);
812 	switch (e) {
813 	case 0:
814 		break;
815 
816 	case ENOMEM:
817 		++retry_count;
818 		if (retry_count < ALLOC_RETRY) {
819 			(void) poll(NULL, 0, msecs);
820 			msecs *= ALLOC_DELAY_MULT;
821 			goto retry;
822 		}
823 
824 		/* Like startd_alloc(). */
825 		uu_die("Insufficient memory.\n");
826 		/* NOTREACHED */
827 
828 	case ECONNABORTED:
829 		libscf_handle_rebind(h);
830 		rebound = B_TRUE;
831 		goto retry;
832 
833 	case EPERM:
834 	case EACCES:
835 	case EROFS:
836 		log_error(LOG_NOTICE, "Could not commit state change for %s "
837 		    "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
838 		/* FALLTHROUGH */
839 
840 	case ENOENT:
841 		ri->ri_i.i_state = new_state;
842 		ri->ri_i.i_next_state = new_state_next;
843 		break;
844 
845 	case EINVAL:
846 	default:
847 		bad_error("_restarter_commit_states", e);
848 	}
849 
850 	states = startd_alloc(sizeof (protocol_states_t));
851 	states->ps_state = new_state;
852 	states->ps_state_next = new_state_next;
853 	states->ps_err = err;
854 	graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
855 	    (void *)states);
856 
857 	state_online = instance_started(ri);
858 
859 	if (prev_state_online && !state_online)
860 		ri->ri_post_offline_hook();
861 	else if (!prev_state_online && state_online)
862 		ri->ri_post_online_hook();
863 
864 	return (rebound ? ECONNRESET : 0);
865 }
866 
867 void
868 restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
869 {
870 	restarter_inst_t *inst;
871 
872 	assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);
873 
874 	inst = inst_lookup_by_name(fmri);
875 	if (inst == NULL)
876 		return;
877 
878 	inst->ri_flags |= flag;
879 
880 	MUTEX_UNLOCK(&inst->ri_lock);
881 }
882 
883 static void
884 restarter_take_pending_snapshots(scf_handle_t *h)
885 {
886 	restarter_inst_t *inst;
887 	int r;
888 
889 	MUTEX_LOCK(&instance_list.ril_lock);
890 
891 	for (inst = uu_list_first(instance_list.ril_instance_list);
892 	    inst != NULL;
893 	    inst = uu_list_next(instance_list.ril_instance_list, inst)) {
894 		const char *fmri;
895 		scf_instance_t *sinst = NULL;
896 
897 		MUTEX_LOCK(&inst->ri_lock);
898 
899 		/*
900 		 * This is where we'd check inst->ri_method_thread and if it
901 		 * were nonzero we'd wait in anticipation of another thread
902 		 * executing a method for inst.  Doing so with the instance_list
903 		 * locked, though, leads to deadlock.  Since taking a snapshot
904 		 * during that window won't hurt anything, we'll just continue.
905 		 */
906 
907 		fmri = inst->ri_i.i_fmri;
908 
909 		if (inst->ri_flags & RINST_RETAKE_RUNNING) {
910 			scf_snapshot_t *rsnap;
911 
912 			(void) libscf_fmri_get_instance(h, fmri, &sinst);
913 
914 			rsnap = libscf_get_or_make_running_snapshot(sinst,
915 			    fmri, B_FALSE);
916 
917 			scf_instance_destroy(sinst);
918 
919 			if (rsnap != NULL)
920 				inst->ri_flags &= ~RINST_RETAKE_RUNNING;
921 
922 			scf_snapshot_destroy(rsnap);
923 		}
924 
925 		if (inst->ri_flags & RINST_RETAKE_START) {
926 			switch (r = libscf_snapshots_poststart(h, fmri,
927 			    B_FALSE)) {
928 			case 0:
929 			case ENOENT:
930 				inst->ri_flags &= ~RINST_RETAKE_START;
931 				break;
932 
933 			case ECONNABORTED:
934 				break;
935 
936 			case EACCES:
937 			default:
938 				bad_error("libscf_snapshots_poststart", r);
939 			}
940 		}
941 
942 		MUTEX_UNLOCK(&inst->ri_lock);
943 	}
944 
945 	MUTEX_UNLOCK(&instance_list.ril_lock);
946 }
947 
948 /* ARGSUSED */
949 void *
950 restarter_post_fsminimal_thread(void *unused)
951 {
952 	scf_handle_t *h;
953 	int r;
954 
955 	h = libscf_handle_create_bound_loop();
956 
957 	for (;;) {
958 		r = libscf_create_self(h);
959 		if (r == 0)
960 			break;
961 
962 		assert(r == ECONNABORTED);
963 		libscf_handle_rebind(h);
964 	}
965 
966 	restarter_take_pending_snapshots(h);
967 
968 	(void) scf_handle_unbind(h);
969 	scf_handle_destroy(h);
970 
971 	return (NULL);
972 }
973 
974 /*
975  * int stop_instance()
976  *
977  *   Stop the instance identified by the instance given as the second argument,
978  *   for the cause stated.
979  *
980  *   Returns
981  *     0 - success
982  *     -1 - inst is in transition
983  */
984 static int
985 stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
986     stop_cause_t cause)
987 {
988 	fork_info_t *info;
989 	const char *cp;
990 	int err;
991 	restarter_error_t re;
992 
993 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
994 	assert(inst->ri_method_thread == 0);
995 
996 	switch (cause) {
997 	case RSTOP_EXIT:
998 		re = RERR_RESTART;
999 		cp = "all processes in service exited";
1000 		break;
1001 	case RSTOP_CORE:
1002 		re = RERR_FAULT;
1003 		cp = "process dumped core";
1004 		break;
1005 	case RSTOP_SIGNAL:
1006 		re = RERR_FAULT;
1007 		cp = "process received fatal signal from outside the service";
1008 		break;
1009 	case RSTOP_HWERR:
1010 		re = RERR_FAULT;
1011 		cp = "process killed due to uncorrectable hardware error";
1012 		break;
1013 	case RSTOP_DEPENDENCY:
1014 		re = RERR_RESTART;
1015 		cp = "dependency activity requires stop";
1016 		break;
1017 	case RSTOP_DISABLE:
1018 		re = RERR_RESTART;
1019 		cp = "service disabled";
1020 		break;
1021 	case RSTOP_RESTART:
1022 		re = RERR_RESTART;
1023 		cp = "service restarting";
1024 		break;
1025 	default:
1026 #ifndef NDEBUG
1027 		(void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
1028 		    cause, __FILE__, __LINE__);
1029 #endif
1030 		abort();
1031 	}
1032 
1033 	/* Services in the disabled and maintenance state are ignored */
1034 	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1035 	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
1036 		log_framework(LOG_DEBUG,
1037 		    "%s: stop_instance -> is maint/disabled\n",
1038 		    inst->ri_i.i_fmri);
1039 		return (0);
1040 	}
1041 
1042 	/* Already stopped instances are left alone */
1043 	if (instance_started(inst) == 0) {
1044 		log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
1045 		    inst->ri_i.i_fmri);
1046 		return (0);
1047 	}
1048 
1049 	if (instance_in_transition(inst)) {
1050 		/* requeue event by returning -1 */
1051 		log_framework(LOG_DEBUG,
1052 		    "Restarter: Not stopping %s, in transition.\n",
1053 		    inst->ri_i.i_fmri);
1054 		return (-1);
1055 	}
1056 
1057 	log_instance(inst, B_TRUE, "Stopping because %s.", cp);
1058 
1059 	log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
1060 	    "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);
1061 
1062 	if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) {
1063 		/*
1064 		 * No need to stop instance, as child has exited; remove
1065 		 * contract and move the instance to the offline state.
1066 		 */
1067 		switch (err = restarter_instance_update_states(local_handle,
1068 		    inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
1069 		    NULL)) {
1070 		case 0:
1071 		case ECONNRESET:
1072 			break;
1073 
1074 		default:
1075 			bad_error("restarter_instance_update_states", err);
1076 		}
1077 
1078 		(void) update_fault_count(inst, FAULT_COUNT_RESET);
1079 
1080 		if (inst->ri_i.i_primary_ctid != 0) {
1081 			inst->ri_m_inst =
1082 			    safe_scf_instance_create(local_handle);
1083 			inst->ri_mi_deleted = B_FALSE;
1084 
1085 			libscf_reget_instance(inst);
1086 			method_remove_contract(inst, B_TRUE, B_TRUE);
1087 
1088 			scf_instance_destroy(inst->ri_m_inst);
1089 			inst->ri_m_inst = NULL;
1090 		}
1091 
1092 		switch (err = restarter_instance_update_states(local_handle,
1093 		    inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
1094 		    NULL)) {
1095 		case 0:
1096 		case ECONNRESET:
1097 			break;
1098 
1099 		default:
1100 			bad_error("restarter_instance_update_states", err);
1101 		}
1102 
1103 		return (0);
1104 	} else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
1105 		/*
1106 		 * Stopping a wait service through means other than the pid
1107 		 * exiting should keep wait_thread() from restarting the
1108 		 * service, by removing it from the wait list.
1109 		 * We cannot remove it right now otherwise the process will
1110 		 * end up <defunct> so mark it to be ignored.
1111 		 */
1112 		wait_ignore_by_fmri(inst->ri_i.i_fmri);
1113 	}
1114 
1115 	switch (err = restarter_instance_update_states(local_handle, inst,
1116 	    inst->ri_i.i_state, inst->ri_i.i_enabled ? RESTARTER_STATE_OFFLINE :
1117 	    RESTARTER_STATE_DISABLED, RERR_NONE, NULL)) {
1118 	case 0:
1119 	case ECONNRESET:
1120 		break;
1121 
1122 	default:
1123 		bad_error("restarter_instance_update_states", err);
1124 	}
1125 
1126 	info = startd_zalloc(sizeof (fork_info_t));
1127 
1128 	info->sf_id = inst->ri_id;
1129 	info->sf_method_type = METHOD_STOP;
1130 	info->sf_event_type = re;
1131 	inst->ri_method_thread = startd_thread_create(method_thread, info);
1132 
1133 	return (0);
1134 }
1135 
1136 /*
1137  * Returns
1138  *   ENOENT - fmri is not in instance_list
1139  *   0 - success
1140  *   ECONNRESET - success, though handle was rebound
1141  *   -1 - instance is in transition
1142  */
1143 int
1144 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
1145 {
1146 	restarter_inst_t *rip;
1147 	int r;
1148 
1149 	rip = inst_lookup_by_name(fmri);
1150 	if (rip == NULL)
1151 		return (ENOENT);
1152 
1153 	r = stop_instance(h, rip, flags);
1154 
1155 	MUTEX_UNLOCK(&rip->ri_lock);
1156 
1157 	return (r);
1158 }
1159 
1160 static void
1161 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
1162     unmaint_cause_t cause)
1163 {
1164 	ctid_t ctid;
1165 	scf_instance_t *inst;
1166 	int r;
1167 	uint_t tries = 0, msecs = ALLOC_DELAY;
1168 	const char *cp;
1169 
1170 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1171 
1172 	if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
1173 		log_error(LOG_DEBUG, "Restarter: "
1174 		    "Ignoring maintenance off command because %s is not in the "
1175 		    "maintenance state.\n", rip->ri_i.i_fmri);
1176 		return;
1177 	}
1178 
1179 	switch (cause) {
1180 	case RUNMAINT_CLEAR:
1181 		cp = "clear requested";
1182 		break;
1183 	case RUNMAINT_DISABLE:
1184 		cp = "disable requested";
1185 		break;
1186 	default:
1187 #ifndef NDEBUG
1188 		(void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
1189 		    cause, __FILE__, __LINE__);
1190 #endif
1191 		abort();
1192 	}
1193 
1194 	log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
1195 	    cp);
1196 	log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
1197 	    "%s.\n", rip->ri_i.i_fmri, cp);
1198 
1199 	(void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
1200 	    RESTARTER_STATE_NONE, RERR_RESTART, NULL);
1201 
1202 	/*
1203 	 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
1204 	 * a primary contract.
1205 	 */
1206 	if (rip->ri_i.i_primary_ctid == 0)
1207 		return;
1208 
1209 	ctid = rip->ri_i.i_primary_ctid;
1210 	contract_abandon(ctid);
1211 	rip->ri_i.i_primary_ctid = 0;
1212 
1213 rep_retry:
1214 	switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
1215 	case 0:
1216 		break;
1217 
1218 	case ECONNABORTED:
1219 		libscf_handle_rebind(h);
1220 		goto rep_retry;
1221 
1222 	case ENOENT:
1223 		/* Must have been deleted. */
1224 		return;
1225 
1226 	case EINVAL:
1227 	case ENOTSUP:
1228 	default:
1229 		bad_error("libscf_handle_rebind", r);
1230 	}
1231 
1232 again:
1233 	r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
1234 	switch (r) {
1235 	case 0:
1236 		break;
1237 
1238 	case ENOMEM:
1239 		++tries;
1240 		if (tries < ALLOC_RETRY) {
1241 			(void) poll(NULL, 0, msecs);
1242 			msecs *= ALLOC_DELAY_MULT;
1243 			goto again;
1244 		}
1245 
1246 		uu_die("Insufficient memory.\n");
1247 		/* NOTREACHED */
1248 
1249 	case ECONNABORTED:
1250 		scf_instance_destroy(inst);
1251 		libscf_handle_rebind(h);
1252 		goto rep_retry;
1253 
1254 	case ECANCELED:
1255 		break;
1256 
1257 	case EPERM:
1258 	case EACCES:
1259 	case EROFS:
1260 		log_error(LOG_INFO,
1261 		    "Could not remove contract id %lu for %s (%s).\n", ctid,
1262 		    rip->ri_i.i_fmri, strerror(r));
1263 		break;
1264 
1265 	case EINVAL:
1266 	case EBADF:
1267 	default:
1268 		bad_error("restarter_remove_contract", r);
1269 	}
1270 
1271 	scf_instance_destroy(inst);
1272 }
1273 
1274 /*
1275  * enable_inst()
1276  *   Set inst->ri_i.i_enabled.  Expects 'e' to be _ENABLE, _DISABLE, or
1277  *   _ADMIN_DISABLE.  If the event is _ENABLE and inst is uninitialized or
1278  *   disabled, move it to offline.  If the event is _DISABLE or
1279  *   _ADMIN_DISABLE, make sure inst will move to disabled.
1280  *
1281  *   Returns
1282  *     0 - success
1283  *     ECONNRESET - h was rebound
1284  */
1285 static int
1286 enable_inst(scf_handle_t *h, restarter_inst_t *inst, restarter_event_type_t e)
1287 {
1288 	restarter_instance_state_t state;
1289 	int r;
1290 
1291 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1292 	assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
1293 	    e == RESTARTER_EVENT_TYPE_DISABLE ||
1294 	    e == RESTARTER_EVENT_TYPE_ENABLE);
1295 	assert(instance_in_transition(inst) == 0);
1296 
1297 	state = inst->ri_i.i_state;
1298 
1299 	if (e == RESTARTER_EVENT_TYPE_ENABLE) {
1300 		inst->ri_i.i_enabled = 1;
1301 
1302 		if (state == RESTARTER_STATE_UNINIT ||
1303 		    state == RESTARTER_STATE_DISABLED) {
1304 			/*
1305 			 * B_FALSE: Don't log an error if the log_instance()
1306 			 * fails because it will fail on the miniroot before
1307 			 * install-discovery runs.
1308 			 */
1309 			log_instance(inst, B_FALSE, "Enabled.");
1310 			log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
1311 			    inst->ri_i.i_fmri);
1312 			(void) restarter_instance_update_states(h, inst,
1313 			    RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
1314 			    RERR_NONE, NULL);
1315 		} else {
1316 			log_framework(LOG_DEBUG, "Restarter: "
1317 			    "Not changing state of %s for enable command.\n",
1318 			    inst->ri_i.i_fmri);
1319 		}
1320 	} else {
1321 		inst->ri_i.i_enabled = 0;
1322 
1323 		switch (state) {
1324 		case RESTARTER_STATE_ONLINE:
1325 		case RESTARTER_STATE_DEGRADED:
1326 			r = stop_instance(h, inst, RSTOP_DISABLE);
1327 			return (r == ECONNRESET ? 0 : r);
1328 
1329 		case RESTARTER_STATE_OFFLINE:
1330 		case RESTARTER_STATE_UNINIT:
1331 			if (inst->ri_i.i_primary_ctid != 0) {
1332 				inst->ri_m_inst = safe_scf_instance_create(h);
1333 				inst->ri_mi_deleted = B_FALSE;
1334 
1335 				libscf_reget_instance(inst);
1336 				method_remove_contract(inst, B_TRUE, B_TRUE);
1337 
1338 				scf_instance_destroy(inst->ri_m_inst);
1339 			}
1340 			/* B_FALSE: See log_instance(..., "Enabled."); above */
1341 			log_instance(inst, B_FALSE, "Disabled.");
1342 			log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
1343 			    inst->ri_i.i_fmri);
1344 			(void) restarter_instance_update_states(h, inst,
1345 			    RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
1346 			    RERR_RESTART, NULL);
1347 			return (0);
1348 
1349 		case RESTARTER_STATE_DISABLED:
1350 			break;
1351 
1352 		case RESTARTER_STATE_MAINT:
1353 			/*
1354 			 * We only want to pull the instance out of maintenance
1355 			 * if the disable is on adminstrative request.  The
1356 			 * graph engine sends _DISABLE events whenever a
1357 			 * service isn't in the disabled state, and we don't
1358 			 * want to pull the service out of maintenance if,
1359 			 * for example, it is there due to a dependency cycle.
1360 			 */
1361 			if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
1362 				unmaintain_instance(h, inst, RUNMAINT_DISABLE);
1363 			break;
1364 
1365 		default:
1366 #ifndef NDEBUG
1367 			(void) fprintf(stderr, "Restarter instance %s has "
1368 			    "unknown state %d.\n", inst->ri_i.i_fmri, state);
1369 #endif
1370 			abort();
1371 		}
1372 	}
1373 
1374 	return (0);
1375 }
1376 
1377 static void
1378 start_instance(scf_handle_t *local_handle, restarter_inst_t *inst)
1379 {
1380 	fork_info_t *info;
1381 
1382 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1383 	assert(instance_in_transition(inst) == 0);
1384 	assert(inst->ri_method_thread == 0);
1385 
1386 	log_framework(LOG_DEBUG, "%s: trying to start instance\n",
1387 	    inst->ri_i.i_fmri);
1388 
1389 	/* Services in the disabled and maintenance state are ignored */
1390 	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1391 	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
1392 	    inst->ri_i.i_enabled == 0) {
1393 		log_framework(LOG_DEBUG,
1394 		    "%s: start_instance -> is maint/disabled\n",
1395 		    inst->ri_i.i_fmri);
1396 		return;
1397 	}
1398 
1399 	/* Already started instances are left alone */
1400 	if (instance_started(inst) == 1) {
1401 		log_framework(LOG_DEBUG,
1402 		    "%s: start_instance -> is already started\n",
1403 		    inst->ri_i.i_fmri);
1404 		return;
1405 	}
1406 
1407 	log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);
1408 
1409 	(void) restarter_instance_update_states(local_handle, inst,
1410 	    inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, NULL);
1411 
1412 	info = startd_zalloc(sizeof (fork_info_t));
1413 
1414 	info->sf_id = inst->ri_id;
1415 	info->sf_method_type = METHOD_START;
1416 	info->sf_event_type = RERR_NONE;
1417 	inst->ri_method_thread = startd_thread_create(method_thread, info);
1418 }
1419 
1420 static void
1421 maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
1422     const char *aux)
1423 {
1424 	fork_info_t *info;
1425 
1426 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1427 	assert(aux != NULL);
1428 	assert(rip->ri_method_thread == 0);
1429 
1430 	log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.", aux);
1431 	log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
1432 	    rip->ri_i.i_fmri, aux);
1433 
1434 	/* Services in the maintenance state are ignored */
1435 	if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
1436 		log_framework(LOG_DEBUG,
1437 		    "%s: maintain_instance -> is already in maintenance\n",
1438 		    rip->ri_i.i_fmri);
1439 		return;
1440 	}
1441 
1442 	if (immediate || !instance_started(rip)) {
1443 		if (rip->ri_i.i_primary_ctid != 0) {
1444 			rip->ri_m_inst = safe_scf_instance_create(h);
1445 			rip->ri_mi_deleted = B_FALSE;
1446 
1447 			libscf_reget_instance(rip);
1448 			method_remove_contract(rip, B_TRUE, B_TRUE);
1449 
1450 			scf_instance_destroy(rip->ri_m_inst);
1451 		}
1452 
1453 		(void) restarter_instance_update_states(h, rip,
1454 		    RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
1455 		    (char *)aux);
1456 		return;
1457 	}
1458 
1459 	(void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
1460 	    RESTARTER_STATE_MAINT, RERR_NONE, (char *)aux);
1461 
1462 	log_transition(rip, MAINT_REQUESTED);
1463 
1464 	info = startd_zalloc(sizeof (*info));
1465 	info->sf_id = rip->ri_id;
1466 	info->sf_method_type = METHOD_STOP;
1467 	info->sf_event_type = RERR_RESTART;
1468 	rip->ri_method_thread = startd_thread_create(method_thread, info);
1469 }
1470 
1471 static void
1472 refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
1473 {
1474 	scf_instance_t *inst;
1475 	scf_snapshot_t *snap;
1476 	fork_info_t *info;
1477 	int r;
1478 
1479 	assert(PTHREAD_MUTEX_HELD(&rip->ri_lock));
1480 
1481 	log_instance(rip, B_TRUE, "Rereading configuration.");
1482 	log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
1483 	    rip->ri_i.i_fmri);
1484 
1485 rep_retry:
1486 	r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
1487 	switch (r) {
1488 	case 0:
1489 		break;
1490 
1491 	case ECONNABORTED:
1492 		libscf_handle_rebind(h);
1493 		goto rep_retry;
1494 
1495 	case ENOENT:
1496 		/* Must have been deleted. */
1497 		return;
1498 
1499 	case EINVAL:
1500 	case ENOTSUP:
1501 	default:
1502 		bad_error("libscf_fmri_get_instance", r);
1503 	}
1504 
1505 	snap = libscf_get_running_snapshot(inst);
1506 
1507 	r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
1508 	    &rip->ri_utmpx_prefix);
1509 	switch (r) {
1510 	case 0:
1511 		log_framework(LOG_DEBUG, "%s is a %s-style service\n",
1512 		    rip->ri_i.i_fmri, service_style(rip->ri_flags));
1513 		break;
1514 
1515 	case ECONNABORTED:
1516 		scf_instance_destroy(inst);
1517 		scf_snapshot_destroy(snap);
1518 		libscf_handle_rebind(h);
1519 		goto rep_retry;
1520 
1521 	case ECANCELED:
1522 	case ENOENT:
1523 		/* Succeed in anticipation of REMOVE_INSTANCE. */
1524 		break;
1525 
1526 	default:
1527 		bad_error("libscf_get_startd_properties", r);
1528 	}
1529 
1530 	if (instance_started(rip)) {
1531 		/* Refresh does not change the state. */
1532 		(void) restarter_instance_update_states(h, rip,
1533 		    rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE, NULL);
1534 
1535 		info = startd_zalloc(sizeof (*info));
1536 		info->sf_id = rip->ri_id;
1537 		info->sf_method_type = METHOD_REFRESH;
1538 		info->sf_event_type = RERR_REFRESH;
1539 
1540 		assert(rip->ri_method_thread == 0);
1541 		rip->ri_method_thread =
1542 		    startd_thread_create(method_thread, info);
1543 	}
1544 
1545 	scf_snapshot_destroy(snap);
1546 	scf_instance_destroy(inst);
1547 }
1548 
/*
 * Printable names for restarter events, looked up by event type (for
 * example event_names[event->riq_type]).  The order of the entries is
 * significant and presumably mirrors the declaration order of
 * restarter_event_type_t -- confirm before adding or moving an entry.
 */
const char *event_names[] = {
	"INVALID",
	"ADD_INSTANCE",
	"REMOVE_INSTANCE",
	"ENABLE",
	"DISABLE",
	"ADMIN_DEGRADED",
	"ADMIN_REFRESH",
	"ADMIN_RESTART",
	"ADMIN_MAINT_OFF",
	"ADMIN_MAINT_ON",
	"ADMIN_MAINT_ON_IMMEDIATE",
	"STOP",
	"START",
	"DEPENDENCY_CYCLE",
	"INVALID_DEPENDENCY",
	"ADMIN_DISABLE"
};
1555 
1556 /*
1557  * void *restarter_process_events()
1558  *
1559  *   Called in a separate thread to process the events on an instance's
1560  *   queue.  Empties the queue completely, and tries to keep the thread
1561  *   around for a little while after the queue is empty to save on
1562  *   startup costs.
1563  */
1564 static void *
1565 restarter_process_events(void *arg)
1566 {
1567 	scf_handle_t *h;
1568 	restarter_instance_qentry_t *event;
1569 	restarter_inst_t *rip;
1570 	char *fmri = (char *)arg;
1571 	struct timespec to;
1572 
1573 	assert(fmri != NULL);
1574 
1575 	h = libscf_handle_create_bound_loop();
1576 
1577 	/* grab the queue lock */
1578 	rip = inst_lookup_queue(fmri);
1579 	if (rip == NULL)
1580 		goto out;
1581 
1582 again:
1583 
1584 	while ((event = uu_list_first(rip->ri_queue)) != NULL) {
1585 		restarter_inst_t *inst;
1586 
1587 		/* drop the queue lock */
1588 		MUTEX_UNLOCK(&rip->ri_queue_lock);
1589 
1590 		/*
1591 		 * Grab the inst lock -- this waits until any outstanding
1592 		 * method finishes running.
1593 		 */
1594 		inst = inst_lookup_by_name(fmri);
1595 		if (inst == NULL) {
1596 			/* Getting deleted in the middle isn't an error. */
1597 			goto cont;
1598 		}
1599 
1600 		assert(instance_in_transition(inst) == 0);
1601 
1602 		/* process the event */
1603 		switch (event->riq_type) {
1604 		case RESTARTER_EVENT_TYPE_ENABLE:
1605 		case RESTARTER_EVENT_TYPE_DISABLE:
1606 		case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
1607 			(void) enable_inst(h, inst, event->riq_type);
1608 			break;
1609 
1610 		case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
1611 			restarter_delete_inst(inst);
1612 			inst = NULL;
1613 			goto cont;
1614 
1615 		case RESTARTER_EVENT_TYPE_STOP:
1616 			(void) stop_instance(h, inst, RSTOP_DEPENDENCY);
1617 			break;
1618 
1619 		case RESTARTER_EVENT_TYPE_START:
1620 			start_instance(h, inst);
1621 			break;
1622 
1623 		case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
1624 			maintain_instance(h, inst, 0, "dependency_cycle");
1625 			break;
1626 
1627 		case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
1628 			maintain_instance(h, inst, 0, "invalid_dependency");
1629 			break;
1630 
1631 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1632 			maintain_instance(h, inst, 0, "administrative_request");
1633 			break;
1634 
1635 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1636 			maintain_instance(h, inst, 1, "administrative_request");
1637 			break;
1638 
1639 		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1640 			unmaintain_instance(h, inst, RUNMAINT_CLEAR);
1641 			break;
1642 
1643 		case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1644 			refresh_instance(h, inst);
1645 			break;
1646 
1647 		case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1648 			log_framework(LOG_WARNING, "Restarter: "
1649 			    "%s command (for %s) unimplemented.\n",
1650 			    event_names[event->riq_type], inst->ri_i.i_fmri);
1651 			break;
1652 
1653 		case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1654 			if (!instance_started(inst)) {
1655 				log_framework(LOG_DEBUG, "Restarter: "
1656 				    "Not restarting %s; not running.\n",
1657 				    inst->ri_i.i_fmri);
1658 			} else {
1659 				/*
1660 				 * Stop the instance.  If it can be restarted,
1661 				 * the graph engine will send a new event.
1662 				 */
1663 				(void) stop_instance(h, inst, RSTOP_RESTART);
1664 			}
1665 			break;
1666 
1667 		case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1668 		default:
1669 #ifndef NDEBUG
1670 			uu_warn("%s:%d: Bad restarter event %d.  "
1671 			    "Aborting.\n", __FILE__, __LINE__, event->riq_type);
1672 #endif
1673 			abort();
1674 		}
1675 
1676 		assert(inst != NULL);
1677 		MUTEX_UNLOCK(&inst->ri_lock);
1678 
1679 cont:
1680 		/* grab the queue lock */
1681 		rip = inst_lookup_queue(fmri);
1682 		if (rip == NULL)
1683 			goto out;
1684 
1685 		/* delete the event */
1686 		uu_list_remove(rip->ri_queue, event);
1687 		startd_free(event, sizeof (restarter_instance_qentry_t));
1688 	}
1689 
1690 	assert(rip != NULL);
1691 
1692 	/*
1693 	 * Try to preserve the thread for a little while for future use.
1694 	 */
1695 	to.tv_sec = 3;
1696 	to.tv_nsec = 0;
1697 	(void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
1698 	    &rip->ri_queue_lock, &to);
1699 
1700 	if (uu_list_first(rip->ri_queue) != NULL)
1701 		goto again;
1702 
1703 	rip->ri_queue_thread = 0;
1704 	MUTEX_UNLOCK(&rip->ri_queue_lock);
1705 out:
1706 	(void) scf_handle_unbind(h);
1707 	scf_handle_destroy(h);
1708 	free(fmri);
1709 	return (NULL);
1710 }
1711 
1712 static int
1713 is_admin_event(restarter_event_type_t t) {
1714 
1715 	switch (t) {
1716 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1717 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1718 	case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1719 	case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1720 	case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1721 	case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1722 		return (1);
1723 	default:
1724 		return (0);
1725 	}
1726 }
1727 
1728 static void
1729 restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
1730 {
1731 	restarter_instance_qentry_t *qe;
1732 	int r;
1733 
1734 	assert(PTHREAD_MUTEX_HELD(&ri->ri_queue_lock));
1735 	assert(!PTHREAD_MUTEX_HELD(&ri->ri_lock));
1736 
1737 	qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
1738 	qe->riq_type = e->rpe_type;
1739 
1740 	uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
1741 	r = uu_list_insert_before(ri->ri_queue, NULL, qe);
1742 	assert(r == 0);
1743 }
1744 
1745 /*
1746  * void *restarter_event_thread()
1747  *
1748  *  Handle incoming graph events by placing them on a per-instance
1749  *  queue.  We can't lock the main part of the instance structure, so
1750  *  just modify the seprarately locked event queue portion.
1751  */
1752 /*ARGSUSED*/
1753 static void *
1754 restarter_event_thread(void *unused)
1755 {
1756 	scf_handle_t *h;
1757 
1758 	/*
1759 	 * This is a new thread, and thus, gets its own handle
1760 	 * to the repository.
1761 	 */
1762 	h = libscf_handle_create_bound_loop();
1763 
1764 	MUTEX_LOCK(&ru->restarter_update_lock);
1765 
1766 	/*CONSTCOND*/
1767 	while (1) {
1768 		restarter_protocol_event_t *e;
1769 
1770 		while (ru->restarter_update_wakeup == 0)
1771 			(void) pthread_cond_wait(&ru->restarter_update_cv,
1772 			    &ru->restarter_update_lock);
1773 
1774 		ru->restarter_update_wakeup = 0;
1775 
1776 		while ((e = restarter_event_dequeue()) != NULL) {
1777 			restarter_inst_t *rip;
1778 			char *fmri;
1779 
1780 			MUTEX_UNLOCK(&ru->restarter_update_lock);
1781 
1782 			/*
1783 			 * ADD_INSTANCE is special: there's likely no
1784 			 * instance structure yet, so we need to handle the
1785 			 * addition synchronously.
1786 			 */
1787 			switch (e->rpe_type) {
1788 			case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1789 				if (restarter_insert_inst(h, e->rpe_inst) != 0)
1790 					log_error(LOG_INFO, "Restarter: "
1791 					    "Could not add %s.\n", e->rpe_inst);
1792 
1793 				MUTEX_LOCK(&st->st_load_lock);
1794 				if (--st->st_load_instances == 0)
1795 					(void) pthread_cond_broadcast(
1796 					    &st->st_load_cv);
1797 				MUTEX_UNLOCK(&st->st_load_lock);
1798 
1799 				goto nolookup;
1800 			}
1801 
1802 			/*
1803 			 * Lookup the instance, locking only the event queue.
1804 			 * Can't grab ri_lock here because it might be held
1805 			 * by a long-running method.
1806 			 */
1807 			rip = inst_lookup_queue(e->rpe_inst);
1808 			if (rip == NULL) {
1809 				log_error(LOG_INFO, "Restarter: "
1810 				    "Ignoring %s command for unknown service "
1811 				    "%s.\n", event_names[e->rpe_type],
1812 				    e->rpe_inst);
1813 				goto nolookup;
1814 			}
1815 
1816 			/* Keep ADMIN events from filling up the queue. */
1817 			if (is_admin_event(e->rpe_type) &&
1818 			    uu_list_numnodes(rip->ri_queue) >
1819 			    RINST_QUEUE_THRESHOLD) {
1820 				MUTEX_UNLOCK(&rip->ri_queue_lock);
1821 				log_instance(rip, B_TRUE, "Instance event "
1822 				    "queue overflow.  Dropping administrative "
1823 				    "request.");
1824 				log_framework(LOG_DEBUG, "%s: Instance event "
1825 				    "queue overflow.  Dropping administrative "
1826 				    "request.\n", rip->ri_i.i_fmri);
1827 				goto nolookup;
1828 			}
1829 
1830 			/* Now add the event to the instance queue. */
1831 			restarter_queue_event(rip, e);
1832 
1833 			if (rip->ri_queue_thread == 0) {
1834 				/*
1835 				 * Start a thread if one isn't already
1836 				 * running.
1837 				 */
1838 				fmri = safe_strdup(e->rpe_inst);
1839 				rip->ri_queue_thread =  startd_thread_create(
1840 				    restarter_process_events, (void *)fmri);
1841 			} else {
1842 				/*
1843 				 * Signal the existing thread that there's
1844 				 * a new event.
1845 				 */
1846 				(void) pthread_cond_broadcast(
1847 				    &rip->ri_queue_cv);
1848 			}
1849 
1850 			MUTEX_UNLOCK(&rip->ri_queue_lock);
1851 nolookup:
1852 			restarter_event_release(e);
1853 
1854 			MUTEX_LOCK(&ru->restarter_update_lock);
1855 		}
1856 	}
1857 
1858 	/*
1859 	 * Unreachable for now -- there's currently no graceful cleanup
1860 	 * called on exit().
1861 	 */
1862 	(void) scf_handle_unbind(h);
1863 	scf_handle_destroy(h);
1864 	return (NULL);
1865 }
1866 
1867 static restarter_inst_t *
1868 contract_to_inst(ctid_t ctid)
1869 {
1870 	restarter_inst_t *inst;
1871 	int id;
1872 
1873 	id = lookup_inst_by_contract(ctid);
1874 	if (id == -1)
1875 		return (NULL);
1876 
1877 	inst = inst_lookup_by_id(id);
1878 	if (inst != NULL) {
1879 		/*
1880 		 * Since ri_lock isn't held by the contract id lookup, this
1881 		 * instance may have been restarted and now be in a new
1882 		 * contract, making the old contract no longer valid for this
1883 		 * instance.
1884 		 */
1885 		if (ctid != inst->ri_i.i_primary_ctid) {
1886 			MUTEX_UNLOCK(&inst->ri_lock);
1887 			inst = NULL;
1888 		}
1889 	}
1890 	return (inst);
1891 }
1892 
1893 /*
1894  * void contract_action()
1895  *   Take action on contract events.
1896  */
1897 static void
1898 contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
1899     uint32_t type)
1900 {
1901 	const char *fmri = inst->ri_i.i_fmri;
1902 
1903 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
1904 
1905 	/*
1906 	 * If startd has stopped this contract, there is no need to
1907 	 * stop it again.
1908 	 */
1909 	if (inst->ri_i.i_primary_ctid > 0 &&
1910 	    inst->ri_i.i_primary_ctid_stopped)
1911 		return;
1912 
1913 	if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
1914 	    | CT_PR_EV_HWERR)) == 0) {
1915 		/*
1916 		 * There shouldn't be other events, since that's not how we set
1917 		 * the terms. Thus, just log an error and drive on.
1918 		 */
1919 		log_framework(LOG_NOTICE,
1920 		    "%s: contract %ld received unexpected critical event "
1921 		    "(%d)\n", fmri, id, type);
1922 		return;
1923 	}
1924 
1925 	assert(instance_in_transition(inst) == 0);
1926 
1927 	if (instance_is_wait_style(inst)) {
1928 		/*
1929 		 * We ignore all events; if they impact the
1930 		 * process we're monitoring, then the
1931 		 * wait_thread will stop the instance.
1932 		 */
1933 		log_framework(LOG_DEBUG,
1934 		    "%s: ignoring contract event on wait-style service\n",
1935 		    fmri);
1936 	} else {
1937 		/*
1938 		 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
1939 		 */
1940 		switch (type) {
1941 		case CT_PR_EV_EMPTY:
1942 			(void) stop_instance(h, inst, RSTOP_EXIT);
1943 			break;
1944 		case CT_PR_EV_CORE:
1945 			(void) stop_instance(h, inst, RSTOP_CORE);
1946 			break;
1947 		case CT_PR_EV_SIGNAL:
1948 			(void) stop_instance(h, inst, RSTOP_SIGNAL);
1949 			break;
1950 		case CT_PR_EV_HWERR:
1951 			(void) stop_instance(h, inst, RSTOP_HWERR);
1952 			break;
1953 		}
1954 	}
1955 }
1956 
1957 /*
1958  * void *restarter_contract_event_thread(void *)
1959  *   Listens to the process contract bundle for critical events, taking action
1960  *   on events from contracts we know we are responsible for.
1961  */
1962 /*ARGSUSED*/
1963 static void *
1964 restarter_contracts_event_thread(void *unused)
1965 {
1966 	int fd, err;
1967 	scf_handle_t *local_handle;
1968 
1969 	/*
1970 	 * Await graph load completion.  That is, stop here, until we've scanned
1971 	 * the repository for contract - instance associations.
1972 	 */
1973 	MUTEX_LOCK(&st->st_load_lock);
1974 	while (!(st->st_load_complete && st->st_load_instances == 0))
1975 		(void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
1976 	MUTEX_UNLOCK(&st->st_load_lock);
1977 
1978 	/*
1979 	 * This is a new thread, and thus, gets its own handle
1980 	 * to the repository.
1981 	 */
1982 	if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
1983 		uu_die("Unable to bind a new repository handle: %s\n",
1984 		    scf_strerror(scf_error()));
1985 
1986 	fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
1987 	if (fd == -1)
1988 		uu_die("process bundle open failed");
1989 
1990 	/*
1991 	 * Make sure we get all events (including those generated by configd
1992 	 * before this thread was started).
1993 	 */
1994 	err = ct_event_reset(fd);
1995 	assert(err == 0);
1996 
1997 	for (;;) {
1998 		int efd, sfd;
1999 		ct_evthdl_t ev;
2000 		uint32_t type;
2001 		ctevid_t evid;
2002 		ct_stathdl_t status;
2003 		ctid_t ctid;
2004 		restarter_inst_t *inst;
2005 		uint64_t cookie;
2006 
2007 		if (err = ct_event_read_critical(fd, &ev)) {
2008 			log_error(LOG_WARNING,
2009 			    "Error reading next contract event: %s",
2010 			    strerror(err));
2011 			continue;
2012 		}
2013 
2014 		evid = ct_event_get_evid(ev);
2015 		ctid = ct_event_get_ctid(ev);
2016 		type = ct_event_get_type(ev);
2017 
2018 		/* Fetch cookie. */
2019 		if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
2020 		    < 0) {
2021 			ct_event_free(ev);
2022 			continue;
2023 		}
2024 
2025 		if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
2026 			log_framework(LOG_WARNING, "Could not get status for "
2027 			    "contract %ld: %s\n", ctid, strerror(err));
2028 
2029 			startd_close(sfd);
2030 			ct_event_free(ev);
2031 			continue;
2032 		}
2033 
2034 		cookie = ct_status_get_cookie(status);
2035 
2036 		log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
2037 		    "cookie %lld\n", type, ctid, cookie);
2038 
2039 		ct_status_free(status);
2040 
2041 		startd_close(sfd);
2042 
2043 		/*
2044 		 * svc.configd(1M) restart handling performed by the
2045 		 * fork_configd_thread.  We don't acknowledge, as that thread
2046 		 * will do so.
2047 		 */
2048 		if (cookie == CONFIGD_COOKIE) {
2049 			ct_event_free(ev);
2050 			continue;
2051 		}
2052 
2053 		inst = NULL;
2054 		if (storing_contract != 0 &&
2055 		    (inst = contract_to_inst(ctid)) == NULL) {
2056 			/*
2057 			 * This can happen for two reasons:
2058 			 * - method_run() has not yet stored the
2059 			 *    the contract into the internal hash table.
2060 			 * - we receive an EMPTY event for an abandoned
2061 			 *    contract.
2062 			 * If there is any contract in the process of
2063 			 * being stored into the hash table then re-read
2064 			 * the event later.
2065 			 */
2066 			log_framework(LOG_DEBUG,
2067 			    "Reset event %d for unknown "
2068 			    "contract id %ld\n", type, ctid);
2069 
2070 			/* don't go too fast */
2071 			(void) poll(NULL, 0, 100);
2072 
2073 			(void) ct_event_reset(fd);
2074 			ct_event_free(ev);
2075 			continue;
2076 		}
2077 
2078 		/*
2079 		 * Do not call contract_to_inst() again if first
2080 		 * call succeeded.
2081 		 */
2082 		if (inst == NULL)
2083 			inst = contract_to_inst(ctid);
2084 		if (inst == NULL) {
2085 			/*
2086 			 * This can happen if we receive an EMPTY
2087 			 * event for an abandoned contract.
2088 			 */
2089 			log_framework(LOG_DEBUG,
2090 			    "Received event %d for unknown contract id "
2091 			    "%ld\n", type, ctid);
2092 		} else {
2093 			log_framework(LOG_DEBUG,
2094 			    "Received event %d for contract id "
2095 			    "%ld (%s)\n", type, ctid,
2096 			    inst->ri_i.i_fmri);
2097 
2098 			contract_action(local_handle, inst, ctid, type);
2099 
2100 			MUTEX_UNLOCK(&inst->ri_lock);
2101 		}
2102 
2103 		efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
2104 		    O_WRONLY);
2105 		if (efd != -1) {
2106 			(void) ct_ctl_ack(efd, evid);
2107 			startd_close(efd);
2108 		}
2109 
2110 		ct_event_free(ev);
2111 
2112 	}
2113 
2114 	/*NOTREACHED*/
2115 	return (NULL);
2116 }
2117 
2118 /*
2119  * Timeout queue, processed by restarter_timeouts_event_thread().
2120  */
2121 timeout_queue_t *timeouts;
2122 static uu_list_pool_t *timeout_pool;
2123 
2124 typedef struct timeout_update {
2125 	pthread_mutex_t		tu_lock;
2126 	pthread_cond_t		tu_cv;
2127 	int			tu_wakeup;
2128 } timeout_update_t;
2129 
2130 timeout_update_t *tu;
2131 
/*
 * FMRIs of the services for which is_timeout_ovr() reports a timeout
 * override.  The list is terminated by NULL.  Both the strings and the
 * table slots are read-only, so the table cannot be modified by accident.
 */
static const char * const timeout_ovr_svcs[] = {
	"svc:/system/manifest-import:default",
	"svc:/network/initial:default",
	"svc:/network/service:default",
	"svc:/system/rmtmpfiles:default",
	"svc:/network/loopback:default",
	"svc:/network/physical:default",
	"svc:/system/device/local:default",
	"svc:/system/metainit:default",
	"svc:/system/filesystem/usr:default",
	"svc:/system/filesystem/minimal:default",
	"svc:/system/filesystem/local:default",
	NULL
};
2146 
2147 int
2148 is_timeout_ovr(restarter_inst_t *inst)
2149 {
2150 	int i;
2151 
2152 	for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
2153 		if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
2154 			log_instance(inst, B_TRUE, "Timeout override by "
2155 			    "svc.startd.  Using infinite timeout.");
2156 			return (1);
2157 		}
2158 	}
2159 
2160 	return (0);
2161 }
2162 
2163 /*ARGSUSED*/
2164 static int
2165 timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
2166 {
2167 	hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
2168 	hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;
2169 
2170 	if (t1 > t2)
2171 		return (1);
2172 	else if (t1 < t2)
2173 		return (-1);
2174 	return (0);
2175 }
2176 
2177 void
2178 timeout_init()
2179 {
2180 	timeouts = startd_zalloc(sizeof (timeout_queue_t));
2181 
2182 	(void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);
2183 
2184 	timeout_pool = startd_list_pool_create("timeouts",
2185 	    sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
2186 	    timeout_compare, UU_LIST_POOL_DEBUG);
2187 	assert(timeout_pool != NULL);
2188 
2189 	timeouts->tq_list = startd_list_create(timeout_pool,
2190 	    timeouts, UU_LIST_SORTED);
2191 	assert(timeouts->tq_list != NULL);
2192 
2193 	tu = startd_zalloc(sizeof (timeout_update_t));
2194 	(void) pthread_cond_init(&tu->tu_cv, NULL);
2195 	(void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
2196 }
2197 
2198 void
2199 timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
2200 {
2201 	hrtime_t now, timeout;
2202 	timeout_entry_t *entry;
2203 	uu_list_index_t idx;
2204 
2205 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
2206 
2207 	now = gethrtime();
2208 
2209 	/*
2210 	 * If we overflow LLONG_MAX, we're never timing out anyways, so
2211 	 * just return.
2212 	 */
2213 	if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
2214 		log_instance(inst, B_TRUE, "timeout_seconds too large, "
2215 		    "treating as infinite.");
2216 		return;
2217 	}
2218 
2219 	/* hrtime is in nanoseconds. Convert timeout_sec. */
2220 	timeout = now + (timeout_sec * 1000000000LL);
2221 
2222 	entry = startd_alloc(sizeof (timeout_entry_t));
2223 	entry->te_timeout = timeout;
2224 	entry->te_ctid = cid;
2225 	entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
2226 	entry->te_logstem = safe_strdup(inst->ri_logstem);
2227 	entry->te_fired = 0;
2228 	/* Insert the calculated timeout time onto the queue. */
2229 	MUTEX_LOCK(&timeouts->tq_lock);
2230 	(void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
2231 	uu_list_node_init(entry, &entry->te_link, timeout_pool);
2232 	uu_list_insert(timeouts->tq_list, entry, idx);
2233 	MUTEX_UNLOCK(&timeouts->tq_lock);
2234 
2235 	assert(inst->ri_timeout == NULL);
2236 	inst->ri_timeout = entry;
2237 
2238 	MUTEX_LOCK(&tu->tu_lock);
2239 	tu->tu_wakeup = 1;
2240 	(void) pthread_cond_broadcast(&tu->tu_cv);
2241 	MUTEX_UNLOCK(&tu->tu_lock);
2242 }
2243 
2244 
2245 void
2246 timeout_remove(restarter_inst_t *inst, ctid_t cid)
2247 {
2248 	assert(PTHREAD_MUTEX_HELD(&inst->ri_lock));
2249 
2250 	if (inst->ri_timeout == NULL)
2251 		return;
2252 
2253 	assert(inst->ri_timeout->te_ctid == cid);
2254 
2255 	MUTEX_LOCK(&timeouts->tq_lock);
2256 	uu_list_remove(timeouts->tq_list, inst->ri_timeout);
2257 	MUTEX_UNLOCK(&timeouts->tq_lock);
2258 
2259 	free(inst->ri_timeout->te_fmri);
2260 	free(inst->ri_timeout->te_logstem);
2261 	startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
2262 	inst->ri_timeout = NULL;
2263 }
2264 
2265 static int
2266 timeout_now()
2267 {
2268 	timeout_entry_t *e;
2269 	hrtime_t now;
2270 	int ret;
2271 
2272 	now = gethrtime();
2273 
2274 	/*
2275 	 * Walk through the (sorted) timeouts list.  While the timeout
2276 	 * at the head of the list is <= the current time, kill the
2277 	 * method.
2278 	 */
2279 	MUTEX_LOCK(&timeouts->tq_lock);
2280 
2281 	for (e = uu_list_first(timeouts->tq_list);
2282 	    e != NULL && e->te_timeout <= now;
2283 	    e = uu_list_next(timeouts->tq_list, e)) {
2284 		log_framework(LOG_WARNING, "%s: Method or service exit timed "
2285 		    "out.  Killing contract %ld.\n", e->te_fmri, e->te_ctid);
2286 		log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
2287 		    "Method or service exit timed out.  Killing contract %ld.",
2288 		    e->te_ctid);
2289 		e->te_fired = 1;
2290 		(void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
2291 	}
2292 
2293 	if (uu_list_numnodes(timeouts->tq_list) > 0)
2294 		ret = 0;
2295 	else
2296 		ret = -1;
2297 
2298 	MUTEX_UNLOCK(&timeouts->tq_lock);
2299 
2300 	return (ret);
2301 }
2302 
2303 /*
2304  * void *restarter_timeouts_event_thread(void *)
2305  *   Responsible for monitoring the method timeouts.  This thread must
2306  *   be started before any methods are called.
2307  */
2308 /*ARGSUSED*/
2309 static void *
2310 restarter_timeouts_event_thread(void *unused)
2311 {
2312 	/*
2313 	 * Timeouts are entered on a priority queue, which is processed by
2314 	 * this thread.  As timeouts are specified in seconds, we'll do
2315 	 * the necessary processing every second, as long as the queue
2316 	 * is not empty.
2317 	 */
2318 
2319 	/*CONSTCOND*/
2320 	while (1) {
2321 		/*
2322 		 * As long as the timeout list isn't empty, process it
2323 		 * every second.
2324 		 */
2325 		if (timeout_now() == 0) {
2326 			(void) sleep(1);
2327 			continue;
2328 		}
2329 
2330 		/* The list is empty, wait until we have more timeouts. */
2331 		MUTEX_LOCK(&tu->tu_lock);
2332 
2333 		while (tu->tu_wakeup == 0)
2334 			(void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);
2335 
2336 		tu->tu_wakeup = 0;
2337 		MUTEX_UNLOCK(&tu->tu_lock);
2338 	}
2339 
2340 	return (NULL);
2341 }
2342 
2343 void
2344 restarter_start()
2345 {
2346 	(void) startd_thread_create(restarter_timeouts_event_thread, NULL);
2347 	(void) startd_thread_create(restarter_event_thread, NULL);
2348 	(void) startd_thread_create(restarter_contracts_event_thread, NULL);
2349 	(void) startd_thread_create(wait_thread, NULL);
2350 }
2351 
2352 
2353 void
2354 restarter_init()
2355 {
2356 	restarter_instance_pool = startd_list_pool_create("restarter_instances",
2357 	    sizeof (restarter_inst_t), offsetof(restarter_inst_t,
2358 	    ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
2359 	(void) memset(&instance_list, 0, sizeof (instance_list));
2360 
2361 	(void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
2362 	instance_list.ril_instance_list = startd_list_create(
2363 	    restarter_instance_pool, &instance_list, UU_LIST_SORTED);
2364 
2365 	restarter_queue_pool = startd_list_pool_create(
2366 	    "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
2367 	    offsetof(restarter_instance_qentry_t,  riq_link), NULL,
2368 	    UU_LIST_POOL_DEBUG);
2369 
2370 	contract_list_pool = startd_list_pool_create(
2371 	    "contract_list", sizeof (contract_entry_t),
2372 	    offsetof(contract_entry_t,  ce_link), NULL,
2373 	    UU_LIST_POOL_DEBUG);
2374 	contract_hash_init();
2375 
2376 	log_framework(LOG_DEBUG, "Initialized restarter\n");
2377 }
2378