xref: /titanic_52/usr/src/cmd/rcm_daemon/common/rcm_lock.c (revision 03831d35f7499c87d51205817c93e9a8d42c4bae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  *
22  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "rcm_impl.h"
29 #include "rcm_module.h"
30 
/*
 * Global locks
 */
mutex_t rcm_req_lock;	/* protects global dr & info request list */

/*
 * Daemon state file
 *
 * state_fd is the descriptor for RCM_STATE_FILE.  rcmd_lock_init() opens
 * the file and maps it into memory as the dr request list; the list is
 * extended N_REQ_CHUNK slots at a time when it fills up.
 */
static int state_fd;
#define	RCM_STATE_FILE	"/var/run/rcm_daemon_state"
#define	N_REQ_CHUNK	10	/* grow 10 entries at a time */

/*
 * Daemon timeout value
 */
#define	RCM_DAEMON_TIMEOUT	300	/* 5 minutes idle time */
47 
/*
 * Struct for a list of outstanding rcm requests
 *
 * A req_t describes a single request.  The code in this file treats a
 * slot whose state is RCM_STATE_REMOVE, or whose device name is empty,
 * as free and available for reuse.
 */
typedef struct {
	int	seq_num;		/* sequence number of request */
	int	state;			/* current state */
	pid_t	pid;			/* pid of initiator */
	uint_t	flag;			/* request flags */
	int	type;			/* resource(device) type */
	timespec_t interval;		/* suspend interval */
	char	device[MAXPATHLEN];	/* name of device or resource */
} req_t;

/*
 * List header followed by an array of req_t slots.  req[1] is only a
 * placeholder for the first slot: get_req_entry() sizes a list as
 *	sizeof (req_list_t) + (n_req_max - 1) * sizeof (req_t)
 * so req[] really holds n_req_max slots.
 */
typedef struct {
	int	n_req;		/* number of slots currently in use */
	int	n_req_max;	/* number of req_t's to follow */
	int	n_seq_max;	/* last sequence number */
	int	idle_timeout;	/* persist idle timeout value */
	req_t	req[1];
	/* more req_t follows */
} req_list_t;
69 
/*
 * dr_req_list is the mapping of RCM_STATE_FILE set up by rcmd_lock_init();
 * info_req_list is allocated from the heap.  Both are accessed with
 * rcm_req_lock held.
 */
static req_list_t *dr_req_list;
static req_list_t *info_req_list;

/* Strings reported for a resource that is locked by a dr request */
static const char *locked_info = "DR operation in progress";
static const char *locked_err = "Resource is busy";

/* Forward declarations for routines defined later in this file */
static int rcmd_get_state();
static void add_to_polling_list(pid_t);
static void remove_from_polling_list(pid_t);

void start_polling_thread();
static void stop_polling_thread();
82 
83 /*
84  * Initialize request lists required for locking
85  */
86 void
87 rcmd_lock_init(void)
88 {
89 	int size;
90 	struct stat fbuf;
91 
92 	/*
93 	 * Start info list with one slot, then grow on demand.
94 	 */
95 	info_req_list = s_calloc(1, sizeof (req_list_t));
96 	info_req_list->n_req_max = 1;
97 
98 	/*
99 	 * Open daemon state file and map in contents
100 	 */
101 	state_fd = open(RCM_STATE_FILE, O_CREAT|O_RDWR, 0600);
102 	if (state_fd == -1) {
103 		rcm_log_message(RCM_ERROR, gettext("cannot open %s: %s\n"),
104 		    RCM_STATE_FILE, strerror(errno));
105 		rcmd_exit(errno);
106 	}
107 
108 	if (fstat(state_fd, &fbuf) != 0) {
109 		rcm_log_message(RCM_ERROR, gettext("cannot stat %s: %s\n"),
110 		    RCM_STATE_FILE, strerror(errno));
111 		rcmd_exit(errno);
112 	}
113 
114 	size = fbuf.st_size;
115 	if (size == 0) {
116 		size = sizeof (req_list_t);
117 		if (ftruncate(state_fd, size) != 0) {
118 			rcm_log_message(RCM_ERROR,
119 			    gettext("cannot truncate %s: %s\n"),
120 			    RCM_STATE_FILE, strerror(errno));
121 			rcmd_exit(errno);
122 		}
123 	}
124 
125 	/*LINTED*/
126 	dr_req_list = (req_list_t *)mmap(NULL, size, PROT_READ|PROT_WRITE,
127 	    MAP_SHARED, state_fd, 0);
128 	if (dr_req_list == MAP_FAILED) {
129 		rcm_log_message(RCM_ERROR, gettext("cannot mmap %s: %s\n"),
130 		    RCM_STATE_FILE, strerror(errno));
131 		rcmd_exit(errno);
132 	}
133 
134 	/*
135 	 * Initial size is one entry
136 	 */
137 	if (dr_req_list->n_req_max == 0) {
138 		dr_req_list->n_req_max = 1;
139 		(void) fsync(state_fd);
140 		return;
141 	}
142 
143 	rcm_log_message(RCM_DEBUG, "n_req = %d, n_req_max = %d\n",
144 	    dr_req_list->n_req, dr_req_list->n_req_max);
145 
146 	/*
147 	 * Recover the daemon state
148 	 */
149 	clean_dr_list();
150 }
151 
152 /*
153  * Get a unique sequence number--to be called with rcm_req_lock held.
154  */
155 static int
156 get_seq_number()
157 {
158 	int number;
159 
160 	if (dr_req_list == NULL)
161 		return (0);
162 
163 	dr_req_list->n_seq_max++;
164 	number  = (dr_req_list->n_seq_max << SEQ_NUM_SHIFT);
165 	(void) fsync(state_fd);
166 
167 	return (number);
168 }
169 
170 /*
171  * Find entry in list with the same resource name and sequence number.
172  * If seq_num == -1, no seq_num matching is required.
173  */
174 static req_t *
175 find_req_entry(char *device, uint_t flag, int seq_num, req_list_t *list)
176 {
177 	int i;
178 
179 	/*
180 	 * Look for entry with the same resource and seq_num.
181 	 * Also match RCM_FILESYS field in flag.
182 	 */
183 	for (i = 0; i < list->n_req_max; i++) {
184 		if (list->req[i].state == RCM_STATE_REMOVE)
185 			/* stale entry */
186 			continue;
187 		/*
188 		 * We need to distiguish a file system root from the directory
189 		 * it is mounted on.
190 		 *
191 		 * Applications are not aware of any difference between the
192 		 * two, but the system keeps track of it internally by
193 		 * checking for mount points while traversing file path.
194 		 * In a similar spirit, RCM is keeping this difference as
195 		 * an implementation detail.
196 		 */
197 		if ((strcmp(device, list->req[i].device) != 0) ||
198 		    (list->req[i].flag & RCM_FILESYS) != (flag & RCM_FILESYS))
199 			/* different resource */
200 			continue;
201 
202 		if ((seq_num != -1) && ((seq_num >> SEQ_NUM_SHIFT) !=
203 		    (list->req[i].seq_num >> SEQ_NUM_SHIFT)))
204 			/* different base seqnum */
205 			continue;
206 
207 		return (&list->req[i]);
208 	}
209 
210 	return (NULL);
211 }
212 
213 /*
214  * Get the next empty req_t entry. If no entry exists, grow the list.
215  */
216 static req_t *
217 get_req_entry(req_list_t **listp)
218 {
219 	int i;
220 	int n_req = (*listp)->n_req;
221 	int n_req_max = (*listp)->n_req_max;
222 
223 	/*
224 	 * If the list is full, grow the list and return the first
225 	 * entry in the new portion.
226 	 */
227 	if (n_req == n_req_max) {
228 		int newsize;
229 
230 		n_req_max += N_REQ_CHUNK;
231 		newsize = sizeof (req_list_t) + (n_req_max - 1) *
232 		    sizeof (req_t);
233 
234 		if (listp == &info_req_list) {
235 			*listp = s_realloc(*listp, newsize);
236 		} else if (ftruncate(state_fd, newsize) != 0) {
237 			rcm_log_message(RCM_ERROR,
238 			    gettext("cannot truncate %s: %s\n"),
239 			    RCM_STATE_FILE, strerror(errno));
240 			rcmd_exit(errno);
241 		/*LINTED*/
242 		} else if ((*listp = (req_list_t *)mmap(NULL, newsize,
243 		    PROT_READ|PROT_WRITE, MAP_SHARED, state_fd, 0)) ==
244 		    MAP_FAILED) {
245 			rcm_log_message(RCM_ERROR,
246 			    gettext("cannot mmap %s: %s\n"),
247 			    RCM_STATE_FILE, strerror(errno));
248 			rcmd_exit(errno);
249 		}
250 
251 		/* Initialize the new entries */
252 		for (i = (*listp)->n_req_max; i < n_req_max; i++) {
253 			(*listp)->req[i].state = RCM_STATE_REMOVE;
254 			(void) strcpy((*listp)->req[i].device, "");
255 		}
256 
257 		(*listp)->n_req_max = n_req_max;
258 		(*listp)->n_req++;
259 		return (&(*listp)->req[n_req]);
260 	}
261 
262 	/*
263 	 * List contains empty slots, find it.
264 	 */
265 	for (i = 0; i < n_req_max; i++) {
266 		if (((*listp)->req[i].device[0] == '\0') ||
267 		    ((*listp)->req[i].state == RCM_STATE_REMOVE)) {
268 			break;
269 		}
270 	}
271 
272 	assert(i < n_req_max);	/* empty slot must exist */
273 
274 	(*listp)->n_req++;
275 	return (&(*listp)->req[i]);
276 }
277 
278 /*
279  * When one resource depends on multiple resources, it's possible that
280  * rcm_get_info can be called multiple times on the resource, resulting
281  * in duplicate information. By assigning a unique sequence number to
282  * each rcm_get_info operation, this duplication can be eliminated.
283  *
284  * Insert a dr entry in info_req_list
285  */
286 int
287 info_req_add(char *rsrcname, uint_t flag, int seq_num)
288 {
289 	int error = 0;
290 	char *device;
291 	req_t *req;
292 
293 	rcm_log_message(RCM_TRACE2, "info_req_add(%s, %d)\n",
294 	    rsrcname, seq_num);
295 
296 	device = resolve_name(rsrcname);
297 	(void) mutex_lock(&rcm_req_lock);
298 
299 	/*
300 	 * Look for entry with the same resource and seq_num.
301 	 * If it exists, we return an error so that such
302 	 * information is not gathered more than once.
303 	 */
304 	if (find_req_entry(device, flag, seq_num, info_req_list) != NULL) {
305 		rcm_log_message(RCM_DEBUG, "getinfo cycle: %s %d \n",
306 		    device, seq_num);
307 		error = -1;
308 		goto out;
309 	}
310 
311 	/*
312 	 * Get empty entry and fill in seq_num and device.
313 	 */
314 	req = get_req_entry(&info_req_list);
315 	req->seq_num = seq_num;
316 	req->state = RCM_STATE_ONLINE;  /* mark that the entry is in use */
317 	req->flag = flag;
318 	(void) strcpy(req->device, device);
319 
320 out:
321 	(void) mutex_unlock(&rcm_req_lock);
322 	free(device);
323 
324 	return (error);
325 }
326 
327 /*
328  * Remove all entries associated with seq_num from info_req_list
329  */
330 void
331 info_req_remove(int seq_num)
332 {
333 	int i;
334 
335 	rcm_log_message(RCM_TRACE3, "info_req_remove(%d)\n", seq_num);
336 
337 	seq_num >>= SEQ_NUM_SHIFT;
338 	(void) mutex_lock(&rcm_req_lock);
339 
340 	/* remove all entries with seq_num */
341 	for (i = 0; i < info_req_list->n_req_max; i++) {
342 		if (info_req_list->req[i].state == RCM_STATE_REMOVE)
343 			continue;
344 
345 		if ((info_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != seq_num)
346 			continue;
347 
348 		info_req_list->req[i].state = RCM_STATE_REMOVE;
349 		info_req_list->n_req--;
350 	}
351 
352 	/*
353 	 * We don't shrink the info_req_list size for now.
354 	 */
355 	(void) mutex_unlock(&rcm_req_lock);
356 }
357 
358 /*
359  * Checking lock conflicts. There is a conflict if:
360  * - attempt to DR a node when either its ancester or descendent
361  *	is in the process of DR
362  * - attempt to register for a node when its ancester is locked for DR
363  */
364 static int
365 check_lock(char *device, uint_t flag, int cflag, rcm_info_t **info)
366 {
367 	int i, ret = RCM_SUCCESS;
368 
369 	if (info)
370 		*info = NULL;
371 
372 	/*
373 	 * During daemon initialization, don't check locks
374 	 */
375 	if (dr_req_list == NULL)
376 		return (ret);
377 
378 	for (i = 0; i < dr_req_list->n_req; i++) {
379 		req_t *req = &dr_req_list->req[i];
380 		char *dr_dev = req->device;
381 
382 		/*
383 		 * Skip empty entries
384 		 */
385 		if ((req->state == RCM_STATE_REMOVE) || (dr_dev[0] == '\0'))
386 			continue;
387 
388 		/*
389 		 * Make sure that none of the ancestors of dr_dev is
390 		 * being operated upon.
391 		 */
392 		if (EQUAL(device, dr_dev) || DESCENDENT(device, dr_dev)) {
393 			/*
394 			 * An exception to this is the filesystem.
395 			 * We should allowed a filesystem rooted at a
396 			 * child directory to be unmounted.
397 			 */
398 			if ((flag & RCM_FILESYS) && (!EQUAL(device, dr_dev) ||
399 			    ((dr_req_list->req[i].flag & RCM_FILESYS) == 0)))
400 				continue;
401 
402 			assert(info != 0);
403 
404 			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
405 			    dr_req_list->req[i].state,
406 			    dr_req_list->req[i].seq_num, NULL, locked_info,
407 			    locked_err, NULL, info);
408 			ret = RCM_CONFLICT;
409 			break;
410 		}
411 
412 		if ((cflag == LOCK_FOR_DR) && DESCENDENT(dr_dev, device)) {
413 			/*
414 			 * Check descendents only for DR request.
415 			 *
416 			 * Could have multiple descendents doing DR,
417 			 * we want to find them all.
418 			 */
419 			assert(info != 0);
420 
421 			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
422 			    dr_req_list->req[i].state,
423 			    dr_req_list->req[i].seq_num, NULL, locked_info,
424 			    locked_err, NULL, info);
425 			ret = RCM_CONFLICT;
426 			/* don't break here, need to find all conflicts */
427 		}
428 	}
429 
430 	return (ret);
431 }
432 
433 /*
434  * Check for lock conflicts for DR operation or client registration
435  */
436 int
437 rsrc_check_lock_conflicts(char *rsrcname, uint_t flag, int cflag,
438     rcm_info_t **info)
439 {
440 	int result;
441 	char *device;
442 
443 	device = resolve_name(rsrcname);
444 	result = check_lock(device, flag, cflag, info);
445 	free(device);
446 
447 	return (result);
448 }
449 
450 static int
451 transition_state(int state)
452 {
453 	/*
454 	 * If the resource state is in transition, ask caller to
455 	 * try again.
456 	 */
457 	switch (state) {
458 	case RCM_STATE_OFFLINING:
459 	case RCM_STATE_SUSPENDING:
460 	case RCM_STATE_RESUMING:
461 	case RCM_STATE_ONLINING:
462 	case RCM_STATE_REMOVING:
463 
464 		return (1);
465 
466 	default:
467 		/*FALLTHROUGH*/
468 		break;
469 	}
470 	return (0);
471 }
472 
/*
 * Update a dr entry in dr_req_list
 *
 * The entry is located by resolved device name and RCM_FILESYS flag only
 * (no sequence number matching).  If the requested transition to 'state'
 * is acceptable, the entry's state, sequence number and interval are
 * replaced (a NULL 'interval' zeroes the stored one) and the state file
 * is synced.
 *
 * Returns:
 *	RCM_SUCCESS	the entry was updated
 *	RCM_FAILURE	no entry exists for the device, or 'state' is not
 *			a state this routine knows how to move to
 *	EAGAIN		the update was refused and the existing entry is
 *			in a transitional state
 *	RCM_CONFLICT	the update was refused for any other reason
 *
 * When the update is refused and 'infop' is not NULL, the existing entry
 * is reported through add_busy_rsrc_to_list().
 */
/*ARGSUSED*/
static int
dr_req_update_entry(char *device, pid_t pid, uint_t flag, int state,
    int seq_num, timespec_t *interval, rcm_info_t **infop)
{
	req_t *req;

	/*
	 * Find request entry. If not found, return RCM_FAILURE
	 */
	req = find_req_entry(device, flag, -1, dr_req_list);

	if (req == NULL) {
		switch (state) {
		case RCM_STATE_OFFLINE_QUERYING:
		case RCM_STATE_SUSPEND_QUERYING:
		case RCM_STATE_OFFLINING:
		case RCM_STATE_SUSPENDING:
			/* could be re-do operation, no error message */
			break;

		default:
			rcm_log_message(RCM_DEBUG,
			    "update non-existing resource %s\n", device);
		}
		return (RCM_FAILURE);
	}

	/*
	 * During initialization, update is unconditional (forced)
	 * in order to bring the daemon up in a sane state.
	 */
	if (rcmd_get_state() == RCMD_INIT)
		goto update;

	/*
	 * Don't allow update with mismatched initiator pid. This could happen
	 * as part of normal operation.
	 */
	if (pid != req->pid) {
		rcm_log_message(RCM_INFO,
		    gettext("mismatched dr initiator pid: %ld %ld\n"),
		    req->pid, pid);
		goto failure;
	}

	rcm_log_message(RCM_TRACE4,
	    "dr_req_update_entry: state=%d, device=%s\n",
	    req->state, req->device);

	/*
	 * Check that the state transition is valid
	 */
	switch (state) {
	case RCM_STATE_OFFLINE_QUERYING:
	case RCM_STATE_OFFLINING:
		/*
		 * This is the case of re-offlining, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_OFFLINE_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid offlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_SUSPEND_QUERYING:
	case RCM_STATE_SUSPENDING:
		/*
		 * This is the case of re-suspending, which applies only
		 * if a previous attempt failed.
		 */
		if ((req->state != RCM_STATE_SUSPEND_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND)) {
			rcm_log_message(RCM_WARNING,
			    gettext("%s: invalid suspending from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_RESUMING:
		/* resuming is accepted only from one of the suspend states */
		if ((req->state != RCM_STATE_SUSPEND) &&
		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
		    (req->state != RCM_STATE_SUSPEND_FAIL)) {
			rcm_log_message(RCM_DEBUG,
			    "%s: invalid resuming from state %d\n",
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_ONLINING:
		/* onlining is accepted only from one of the offline states */
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid onlining from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	case RCM_STATE_REMOVING:
		/* removing requires a completed or failed offline */
		if ((req->state != RCM_STATE_OFFLINE) &&
		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
			rcm_log_message(RCM_INFO,
			    gettext("%s: invalid removing from state %d\n"),
			    device, req->state);
			goto failure;
		}
		break;

	/*
	 * The remaining targets are completion states; they are only
	 * asserted to follow the matching in-progress state.
	 */
	case RCM_STATE_SUSPEND_FAIL:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE_FAIL:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_SUSPEND:
		assert(req->state == RCM_STATE_SUSPENDING);
		break;

	case RCM_STATE_OFFLINE:
		assert(req->state == RCM_STATE_OFFLINING);
		break;

	case RCM_STATE_ONLINE:
		assert((req->state == RCM_STATE_RESUMING) ||
		    (req->state == RCM_STATE_ONLINING));
		break;

	default:	/* shouldn't be here */
		rcm_log_message(RCM_ERROR,
		    gettext("invalid update to dr state: %d\n"), state);
		return (RCM_FAILURE);
	}

update:
	/*
	 * update the state, interval, and sequence number; sync state file
	 */
	req->state = state;
	req->seq_num = seq_num;

	if (interval)
		req->interval = *interval;
	else
		bzero(&req->interval, sizeof (timespec_t));

	(void) fsync(state_fd);
	return (RCM_SUCCESS);

failure:
	/* report the entry that blocks the update, if the caller asked */
	if (infop != NULL) {
		add_busy_rsrc_to_list(req->device, req->pid, req->state,
		    req->seq_num, NULL, locked_info, locked_err, NULL, infop);
	}

	/*
	 * A request may be left in a transition state because the operator
	 * typed ctrl-C. In this case, the daemon thread continues to run
	 * and will eventually put the state in a non-transitional state.
	 *
	 * To be safe, we return EAGAIN to allow librcm to loop and retry.
	 * If we are called from a module, loop & retry could result in a
	 * deadlock. The caller will check for this case and turn EAGAIN
	 * into RCM_CONFLICT.
	 */
	if (transition_state(req->state)) {
		return (EAGAIN);
	}

	return (RCM_CONFLICT);
}
666 
667 /*
668  * Insert a dr entry in dr_req_list
669  */
670 int
671 dr_req_add(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
672     timespec_t *interval, rcm_info_t **info)
673 {
674 	int error;
675 	char *device;
676 	req_t *req;
677 
678 	rcm_log_message(RCM_TRACE3, "dr_req_add(%s, %ld, 0x%x, %d, %d, %p)\n",
679 	    rsrcname, pid, flag, state, seq_num, (void *)info);
680 
681 	device = resolve_name(rsrcname);
682 	if (device == NULL)
683 		return (EINVAL);
684 
685 	(void) mutex_lock(&rcm_req_lock);
686 
687 	/*
688 	 * In the re-offline/suspend case, attempt to update dr request.
689 	 *
690 	 * If this succeeds, return success;
691 	 * If this fails because of a conflict, return error;
692 	 * If this this fails because no entry exists, add a new entry.
693 	 */
694 	error = dr_req_update_entry(device, pid, flag, state, seq_num, interval,
695 	    info);
696 
697 	switch (error) {
698 	case RCM_FAILURE:
699 		/* proceed to add a new entry */
700 		break;
701 
702 	case RCM_CONFLICT:
703 	case RCM_SUCCESS:
704 	case EAGAIN:
705 	default:
706 		goto out;
707 	}
708 
709 	/*
710 	 * Check for lock conflicts
711 	 */
712 	error = check_lock(device, flag, LOCK_FOR_DR, info);
713 	if (error != RCM_SUCCESS) {
714 		error = RCM_CONFLICT;
715 		goto out;
716 	}
717 
718 	/*
719 	 * Get empty request entry, fill in values and sync state file
720 	 */
721 	req = get_req_entry(&dr_req_list);
722 
723 	req->seq_num = seq_num;
724 	req->pid = pid;
725 	req->flag = flag;
726 	req->state = state;
727 	req->type = rsrc_get_type(device);
728 	(void) strcpy(req->device, device);
729 
730 	/* cache interval for failure recovery */
731 	if (interval)
732 		req->interval = *interval;
733 	else
734 		bzero(&req->interval, sizeof (timespec_t));
735 
736 	(void) fsync(state_fd);
737 
738 	/*
739 	 * Add initiator pid to polling list
740 	 */
741 	add_to_polling_list(req->pid);
742 
743 out:
744 	(void) mutex_unlock(&rcm_req_lock);
745 	free(device);
746 
747 	return (error);
748 }
749 
750 /*
751  * Update a dr entry in dr_req_list
752  */
753 /*ARGSUSED*/
754 int
755 dr_req_update(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
756     rcm_info_t **info)
757 {
758 	int error;
759 	char *device = resolve_name(rsrcname);
760 
761 	rcm_log_message(RCM_TRACE3, "dr_req_update(%s, %ld, 0x%x, %d, %d)\n",
762 	    rsrcname, pid, flag, state, seq_num);
763 
764 	(void) mutex_lock(&rcm_req_lock);
765 	error = dr_req_update_entry(device, pid, flag, state, seq_num, NULL,
766 	    info);
767 	(void) mutex_unlock(&rcm_req_lock);
768 	free(device);
769 
770 	return (error);
771 }
772 
773 /*
774  * This function scans the DR request list for the next, non-removed
775  * entry that is part of the specified sequence.  The 'device' name
776  * of the entry is copied into the provided 'rsrc' buffer.
777  *
778  * The 'rsrc' buffer is required because the DR request list is only
779  * locked during the duration of this lookup.  Giving a direct pointer
780  * to something in the list would be unsafe.
781  */
782 int
783 dr_req_lookup(int seq_num, char *rsrc)
784 {
785 	int	i;
786 	int	len;
787 	int	base = (seq_num >> SEQ_NUM_SHIFT);
788 	int	retval = RCM_FAILURE;
789 
790 	if (rsrc == NULL) {
791 		return (RCM_FAILURE);
792 	}
793 
794 	(void) mutex_lock(&rcm_req_lock);
795 
796 	for (i = 0; i < dr_req_list->n_req_max; i++) {
797 
798 		/* Skip removed or non-matching entries */
799 		if ((dr_req_list->req[i].state == RCM_STATE_REMOVE) ||
800 		    ((dr_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != base)) {
801 			continue;
802 		}
803 
804 		/* Copy the next-matching 'device' name into 'rsrc' */
805 		len = strlcpy(rsrc, dr_req_list->req[i].device, MAXPATHLEN);
806 		if (len < MAXPATHLEN) {
807 			retval = RCM_SUCCESS;
808 		}
809 		break;
810 	}
811 
812 	(void) mutex_unlock(&rcm_req_lock);
813 
814 	return (retval);
815 }
816 
817 /*
818  * Remove a dr entry in dr_req_list
819  */
820 void
821 dr_req_remove(char *rsrcname, uint_t flag)
822 {
823 	req_t *req;
824 	char *device = resolve_name(rsrcname);
825 
826 	rcm_log_message(RCM_TRACE3, "dr_req_remove(%s)\n", rsrcname);
827 
828 	(void) mutex_lock(&rcm_req_lock);
829 
830 	/* find entry */
831 	req = find_req_entry(device, flag, -1, dr_req_list);
832 	free(device);
833 
834 	if (req == NULL) {
835 		(void) mutex_unlock(&rcm_req_lock);
836 		rcm_log_message(RCM_WARNING,
837 		    gettext("dr_req entry %s not found\n"), rsrcname);
838 		return;
839 	}
840 
841 	req->state = RCM_STATE_REMOVE;
842 	dr_req_list->n_req--;
843 	(void) fsync(state_fd);
844 
845 	/*
846 	 * remove pid from polling list
847 	 */
848 	remove_from_polling_list(req->pid);
849 
850 	/*
851 	 * We don't shrink the dr_req_list size for now.
852 	 * Shouldn't cause big memory leaks.
853 	 */
854 	(void) mutex_unlock(&rcm_req_lock);
855 }
856 
857 /*
858  * Return the list of ongoing dr operation requests
859  */
860 rcm_info_t *
861 rsrc_dr_info()
862 {
863 	int i;
864 	rcm_info_t *info;
865 	rcm_info_t *result = NULL;
866 	char *rsrc;
867 	int len;
868 
869 	rcm_log_message(RCM_TRACE2, "rsrc_dr_info()\n");
870 
871 	(void) mutex_lock(&rcm_req_lock);
872 	for (i = 0; i < dr_req_list->n_req_max; i++) {
873 		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
874 			continue;
875 
876 		if (dr_req_list->req[i].device[0] == '\0')
877 			continue;
878 
879 		if (dr_req_list->req[i].flag & RCM_FILESYS) {
880 			len = strlen(dr_req_list->req[i].device) + 5;
881 			rsrc = s_malloc(len);
882 			(void) snprintf(rsrc, len, "%s(fs)",
883 			    dr_req_list->req[i].device);
884 		} else {
885 			rsrc = s_strdup(dr_req_list->req[i].device);
886 		}
887 
888 		info = s_calloc(1, sizeof (*info));
889 		if (errno = nvlist_alloc(&(info->info), NV_UNIQUE_NAME, 0)) {
890 			rcm_log_message(RCM_ERROR,
891 			    gettext("failed (nvlist_alloc=%s).\n"),
892 			    strerror(errno));
893 			rcmd_exit(errno);
894 		}
895 
896 		if (errno = nvlist_add_string(info->info, RCM_RSRCNAME, rsrc)) {
897 			rcm_log_message(RCM_ERROR,
898 			    gettext("failed (nvlist_add=%s).\n"),
899 			    strerror(errno));
900 			rcmd_exit(errno);
901 		}
902 		(void) free(rsrc);
903 
904 		if (errno = nvlist_add_int64(info->info, RCM_CLIENT_ID,
905 		    dr_req_list->req[i].pid)) {
906 			rcm_log_message(RCM_ERROR,
907 			    gettext("failed (nvlist_add=%s).\n"),
908 			    strerror(errno));
909 			rcmd_exit(errno);
910 		}
911 
912 		if (errno = nvlist_add_int32(info->info, RCM_SEQ_NUM,
913 		    dr_req_list->req[i].seq_num)) {
914 			rcm_log_message(RCM_ERROR,
915 			    gettext("failed (nvlist_add=%s).\n"),
916 			    strerror(errno));
917 			rcmd_exit(errno);
918 		}
919 
920 		if (errno = nvlist_add_int32(info->info, RCM_RSRCSTATE,
921 		    dr_req_list->req[i].state)) {
922 			rcm_log_message(RCM_ERROR,
923 			    gettext("failed (nvlist_add=%s).\n"),
924 			    strerror(errno));
925 			rcmd_exit(errno);
926 		}
927 
928 		if (errno = nvlist_add_string(info->info, RCM_CLIENT_INFO,
929 		    (char *)locked_info)) {
930 			rcm_log_message(RCM_ERROR,
931 			    gettext("failed (nvlist_add=%s).\n"),
932 			    strerror(errno));
933 			rcmd_exit(errno);
934 		}
935 
936 		info->next = result;
937 		result = info;
938 	}
939 	(void) mutex_unlock(&rcm_req_lock);
940 
941 	return (result);
942 }
943 
944 /*
945  * Eliminate entries whose dr initiator is no longer running
946  * and recover daemon state during daemon restart.
947  *
948  * This routine is called from either during daemon initialization
949  * after all modules have registered resources or from the cleanup
950  * thread. In either case, it is the only thread running in the
951  * daemon.
952  */
953 void
954 clean_dr_list()
955 {
956 	int i;
957 	struct clean_list {
958 		struct clean_list *next;
959 		char *rsrcname;
960 		pid_t pid;
961 		int seq_num;
962 		int state;
963 		timespec_t interval;
964 	} *tmp, *list = NULL;
965 	char *rsrcnames[2];
966 
967 	rcm_log_message(RCM_TRACE3,
968 	    "clean_dr_list(): look for stale dr initiators\n");
969 
970 	rsrcnames[1] = NULL;
971 
972 	/*
973 	 * Make a list of entries to recover. This is necessary because
974 	 * the recovery operation will modify dr_req_list.
975 	 */
976 	(void) mutex_lock(&rcm_req_lock);
977 	for (i = 0; i < dr_req_list->n_req_max; i++) {
978 		/* skip empty entries */
979 		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
980 			continue;
981 
982 		if (dr_req_list->req[i].device[0] == '\0')
983 			continue;
984 
985 		/* skip cascade operations */
986 		if (dr_req_list->req[i].seq_num & SEQ_NUM_MASK)
987 			continue;
988 
989 		/*
990 		 * In the cleanup case, ignore entries with initiators alive
991 		 */
992 		if ((rcmd_get_state() == RCMD_CLEANUP) &&
993 		    proc_exist(dr_req_list->req[i].pid))
994 			continue;
995 
996 		rcm_log_message(RCM_TRACE1,
997 		    "found stale entry: %s\n", dr_req_list->req[i].device);
998 
999 		tmp = s_malloc(sizeof (*tmp));
1000 		tmp->rsrcname = s_strdup(dr_req_list->req[i].device);
1001 		tmp->state = dr_req_list->req[i].state;
1002 		tmp->pid = dr_req_list->req[i].pid;
1003 		tmp->seq_num = dr_req_list->req[i].seq_num;
1004 		tmp->interval = dr_req_list->req[i].interval;
1005 		tmp->next = list;
1006 		list = tmp;
1007 	}
1008 	(void) mutex_unlock(&rcm_req_lock);
1009 
1010 	if (list == NULL)
1011 		return;
1012 
1013 	/*
1014 	 * If everything worked normally, we shouldn't be here.
1015 	 * Since we are here, something went wrong, so say something.
1016 	 */
1017 	if (rcmd_get_state() == RCMD_INIT) {
1018 		rcm_log_message(RCM_NOTICE, gettext("rcm_daemon died "
1019 		    "unexpectedly, recovering previous daemon state\n"));
1020 	} else {
1021 		rcm_log_message(RCM_INFO, gettext("one or more dr initiator "
1022 		    "died, attempting automatic recovery\n"));
1023 	}
1024 
1025 	while (list) {
1026 		tmp = list;
1027 		list = tmp->next;
1028 
1029 		switch (tmp->state) {
1030 		case RCM_STATE_OFFLINE_QUERY:
1031 		case RCM_STATE_OFFLINE_QUERY_FAIL:
1032 			rsrcnames[0] = tmp->rsrcname;
1033 			if (proc_exist(tmp->pid)) {
1034 				/* redo */
1035 				(void) process_resource_offline(rsrcnames,
1036 				    tmp->pid, RCM_QUERY, tmp->seq_num, NULL);
1037 			} else {
1038 				/* undo */
1039 				(void) notify_resource_online(rsrcnames,
1040 				    tmp->pid, 0, tmp->seq_num, NULL);
1041 			}
1042 			break;
1043 
1044 		case RCM_STATE_OFFLINE:
1045 		case RCM_STATE_OFFLINE_FAIL:
1046 			rsrcnames[0] = tmp->rsrcname;
1047 			if (proc_exist(tmp->pid)) {
1048 				/* redo */
1049 				(void) process_resource_offline(rsrcnames,
1050 				    tmp->pid, 0, tmp->seq_num, NULL);
1051 			} else {
1052 				/* undo */
1053 				(void) notify_resource_online(rsrcnames,
1054 				    tmp->pid, 0, tmp->seq_num, NULL);
1055 			}
1056 			break;
1057 
1058 		case RCM_STATE_SUSPEND_QUERY:
1059 		case RCM_STATE_SUSPEND_QUERY_FAIL:
1060 			rsrcnames[0] = tmp->rsrcname;
1061 			if (proc_exist(tmp->pid)) {
1062 				/* redo */
1063 				(void) process_resource_suspend(rsrcnames,
1064 				    tmp->pid, RCM_QUERY, tmp->seq_num,
1065 				    &tmp->interval, NULL);
1066 			} else {
1067 				/* undo */
1068 				(void) notify_resource_resume(rsrcnames,
1069 				    tmp->pid, 0, tmp->seq_num, NULL);
1070 			}
1071 			break;
1072 
1073 		case RCM_STATE_SUSPEND:
1074 		case RCM_STATE_SUSPEND_FAIL:
1075 			rsrcnames[0] = tmp->rsrcname;
1076 			if (proc_exist(tmp->pid)) {
1077 				/* redo */
1078 				(void) process_resource_suspend(rsrcnames,
1079 				    tmp->pid, 0, tmp->seq_num, &tmp->interval,
1080 				    NULL);
1081 			} else {
1082 				/* undo */
1083 				(void) notify_resource_resume(rsrcnames,
1084 				    tmp->pid, 0, tmp->seq_num, NULL);
1085 			}
1086 			break;
1087 
1088 		case RCM_STATE_OFFLINING:
1089 		case RCM_STATE_ONLINING:
1090 			rsrcnames[0] = tmp->rsrcname;
1091 			(void) notify_resource_online(rsrcnames, tmp->pid, 0,
1092 			    tmp->seq_num, NULL);
1093 			break;
1094 
1095 		case RCM_STATE_SUSPENDING:
1096 		case RCM_STATE_RESUMING:
1097 			rsrcnames[0] = tmp->rsrcname;
1098 			(void) notify_resource_resume(rsrcnames, tmp->pid, 0,
1099 			    tmp->seq_num, NULL);
1100 			break;
1101 
1102 		case RCM_STATE_REMOVING:
1103 			rsrcnames[0] = tmp->rsrcname;
1104 			(void) notify_resource_remove(rsrcnames, tmp->pid, 0,
1105 			    tmp->seq_num, NULL);
1106 			break;
1107 
1108 		default:
1109 			rcm_log_message(RCM_WARNING,
1110 			    gettext("%s in unknown state %d\n"),
1111 			    tmp->rsrcname, tmp->state);
1112 			break;
1113 		}
1114 		free(tmp->rsrcname);
1115 		free(tmp);
1116 	}
1117 }
1118 
/*
 * Selected thread blocking based on event type
 */
barrier_t barrier;

/*
 * Barrier (daemon) states:
 *	RCMD_INIT - daemon is initializing, only register allowed
 *	RCMD_NORMAL - normal daemon processing
 *	RCMD_CLEANUP - cleanup thread is waiting or running
 *
 * rcmd_get_state() returns the current state; the value is read without
 * taking barrier.lock.
 */
int
rcmd_get_state()
{
	return (barrier.state);
}
1135 
1136 void
1137 rcmd_set_state(int state)
1138 {
1139 	/*
1140 	 * The state transition is as follows:
1141 	 *	INIT --> NORMAL <---> CLEANUP
1142 	 * The implementation favors the cleanup thread
1143 	 */
1144 
1145 	(void) mutex_lock(&barrier.lock);
1146 	barrier.state = state;
1147 
1148 	switch (state) {
1149 	case RCMD_CLEANUP:
1150 		/*
1151 		 * Wait for existing threads to exit
1152 		 */
1153 		barrier.wanted++;
1154 		while (barrier.thr_count != 0)
1155 			(void) cond_wait(&barrier.cv, &barrier.lock);
1156 		barrier.wanted--;
1157 		barrier.thr_count = -1;
1158 		break;
1159 
1160 	case RCMD_INIT:
1161 	case RCMD_NORMAL:
1162 	default:
1163 		if (barrier.thr_count == -1)
1164 			barrier.thr_count = 0;
1165 		if (barrier.wanted)
1166 			(void) cond_broadcast(&barrier.cv);
1167 		break;
1168 	}
1169 
1170 	(void) mutex_unlock(&barrier.lock);
1171 }
1172 
1173 /*
1174  * Increment daemon thread count
1175  */
1176 int
1177 rcmd_thr_incr(int cmd)
1178 {
1179 	int seq_num;
1180 
1181 	(void) mutex_lock(&barrier.lock);
1182 	/*
1183 	 * Set wanted flag
1184 	 */
1185 	barrier.wanted++;
1186 
1187 	/*
1188 	 * Wait till it is safe for daemon to perform the operation
1189 	 *
1190 	 * NOTE: if a module registers by passing a request to the
1191 	 *	client proccess, we may need to allow register
1192 	 *	to come through during daemon initialization.
1193 	 */
1194 	while (barrier.state != RCMD_NORMAL)
1195 		(void) cond_wait(&barrier.cv, &barrier.lock);
1196 
1197 	if ((cmd == CMD_EVENT) ||
1198 	    (cmd == CMD_REGISTER) ||
1199 	    (cmd == CMD_UNREGISTER)) {
1200 		/*
1201 		 * Event passthru and register ops don't need sequence number
1202 		 */
1203 		seq_num = -1;
1204 	} else {
1205 		/*
1206 		 * Non register operation gets a sequence number
1207 		 */
1208 		seq_num = get_seq_number();
1209 	}
1210 	barrier.wanted--;
1211 	barrier.thr_count++;
1212 	(void) mutex_unlock(&barrier.lock);
1213 
1214 	if ((cmd == CMD_OFFLINE) ||
1215 	    (cmd == CMD_SUSPEND) ||
1216 	    (cmd == CMD_GETINFO)) {
1217 		/*
1218 		 * For these operations, need to ask modules to
1219 		 * register any new resources that came online.
1220 		 *
1221 		 * This is because mount/umount are not instrumented
1222 		 * to register with rcm before using system resources.
1223 		 * Certain registration ops may fail during sync, which
1224 		 * indicates race conditions. This cannot be avoided
1225 		 * without changing mount/umount.
1226 		 */
1227 		rcmd_db_sync();
1228 	}
1229 
1230 	return (seq_num);
1231 }
1232 
1233 /*
1234  * Decrement thread count
1235  */
1236 void
1237 rcmd_thr_decr()
1238 {
1239 	/*
1240 	 * Decrement thread count and wake up reload/cleanup thread.
1241 	 */
1242 	(void) mutex_lock(&barrier.lock);
1243 	barrier.last_update = time(NULL);
1244 	if (--barrier.thr_count == 0)
1245 		(void) cond_broadcast(&barrier.cv);
1246 	(void) mutex_unlock(&barrier.lock);
1247 }
1248 
/*
 * Wake up all waiting threads as a result of SIGHUP.
 *
 * sighup_received is set under barrier.lock and is read by
 * rcmd_start_timer(), which then forces its timeout to 0.
 */
static int sighup_received = 0;

void
rcmd_thr_signal()
{
	(void) mutex_lock(&barrier.lock);
	sighup_received = 1;
	/* every waiter on barrier.cv is woken, not only the timer thread */
	(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);
}
1262 
1263 void
1264 rcmd_start_timer(int timeout)
1265 {
1266 	timestruc_t abstime;
1267 
1268 	if (timeout == 0)
1269 		timeout = RCM_DAEMON_TIMEOUT;	/* default to 5 minutes */
1270 	else
1271 		dr_req_list->idle_timeout = timeout;	/* persist timeout */
1272 
1273 	if (timeout > 0) {
1274 		abstime.tv_sec = time(NULL) + timeout;
1275 	}
1276 
1277 	(void) mutex_lock(&barrier.lock);
1278 	for (;;) {
1279 		int idletime;
1280 		int is_active;
1281 
1282 		if (timeout > 0)
1283 			(void) cond_timedwait(&barrier.cv, &barrier.lock,
1284 			    &abstime);
1285 		else
1286 			(void) cond_wait(&barrier.cv, &barrier.lock);
1287 
1288 		/*
1289 		 * If sighup received, change timeout to 0 so the daemon is
1290 		 * shut down at the first possible moment
1291 		 */
1292 		if (sighup_received)
1293 			timeout = 0;
1294 
1295 		/*
1296 		 * If timeout is negative, never shutdown the daemon
1297 		 */
1298 		if (timeout < 0)
1299 			continue;
1300 
1301 		/*
1302 		 * Check for ongoing/pending activity
1303 		 */
1304 		is_active = (barrier.thr_count || barrier.wanted ||
1305 		    (dr_req_list->n_req != 0));
1306 		if (is_active) {
1307 			abstime.tv_sec = time(NULL) + timeout;
1308 			continue;
1309 		}
1310 
1311 		/*
1312 		 * If idletime is less than timeout, continue to wait
1313 		 */
1314 		idletime = time(NULL) - barrier.last_update;
1315 		if (idletime < timeout) {
1316 			abstime.tv_sec = barrier.last_update + timeout;
1317 			continue;
1318 		}
1319 		break;
1320 	}
1321 
1322 	(void) script_main_fini();
1323 
1324 	rcm_log_message(RCM_INFO, gettext("rcm_daemon is shut down.\n"));
1325 }
1326 
/*
 * Code related to polling client pid's
 * Not declared as static so that we can find this structure easily
 * in the core file.
 *
 * pids[], refcnt[] and fds[] are parallel arrays: entry i of each
 * describes the same client process.
 */
struct {
	int		n_pids;		/* entries currently in use */
	int		n_max_pids;	/* entries allocated in each array */
	thread_t	poll_tid;	/* poll thread id, (thread_t)-1 if none */
	int		signaled;	/* 1 once SIGUSR1 was sent to poll_tid */
	pid_t		*pids;		/* pids being polled */
	int		*refcnt;	/* reference count of each pid */
	struct pollfd	*fds;		/* fd of /proc/<pid>/as for each pid */
	cond_t		cv;	/* the associated lock is rcm_req_lock */
} polllist;
1342 
1343 static int
1344 find_pid_index(pid_t pid)
1345 {
1346 	int i;
1347 
1348 	for (i = 0; i < polllist.n_pids; i++) {
1349 		if (polllist.pids[i] == pid) {
1350 			return (i);
1351 		}
1352 	}
1353 	return (-1);
1354 }
1355 
1356 /*
1357  * Resize buffer for new pids
1358  */
1359 static int
1360 get_pid_index()
1361 {
1362 	const int n_chunk = 10;
1363 
1364 	int n_max;
1365 	int index = polllist.n_pids;
1366 
1367 	if (polllist.n_pids < polllist.n_max_pids) {
1368 		polllist.n_pids++;
1369 		return (index);
1370 	}
1371 
1372 	if (polllist.n_max_pids == 0) {
1373 		n_max = n_chunk;
1374 		polllist.pids = s_calloc(n_max, sizeof (pid_t));
1375 		polllist.refcnt = s_calloc(n_max, sizeof (int));
1376 		polllist.fds = s_calloc(n_max, sizeof (struct pollfd));
1377 	} else {
1378 		n_max = polllist.n_max_pids + n_chunk;
1379 		polllist.pids = s_realloc(polllist.pids,
1380 		    n_max * sizeof (pid_t));
1381 		polllist.refcnt = s_realloc(polllist.refcnt,
1382 		    n_max * sizeof (int));
1383 		polllist.fds = s_realloc(polllist.fds,
1384 		    n_max * sizeof (struct pollfd));
1385 	}
1386 	polllist.n_max_pids = n_max;
1387 	polllist.n_pids++;
1388 	return (index);
1389 }
1390 
1391 /*
1392  * rcm_req_lock must be held
1393  */
1394 static void
1395 add_to_polling_list(pid_t pid)
1396 {
1397 	int fd, index;
1398 	char procfile[MAXPATHLEN];
1399 
1400 	if (pid == (pid_t)0)
1401 		return;
1402 
1403 	rcm_log_message(RCM_TRACE1, "add_to_polling_list(%ld)\n", pid);
1404 
1405 	/*
1406 	 * Need to stop the poll thread before manipulating the polllist
1407 	 * since poll thread may possibly be using polllist.fds[] and
1408 	 * polllist.n_pids. As an optimization, first check if the pid
1409 	 * is already in the polllist. If it is, there is no need to
1410 	 * stop the poll thread. Just increment the pid reference count
1411 	 * and return;
1412 	 */
1413 	index = find_pid_index(pid);
1414 	if (index != -1) {
1415 		polllist.refcnt[index]++;
1416 		return;
1417 	}
1418 
1419 	stop_polling_thread();
1420 
1421 	/*
1422 	 * In an attempt to stop the poll thread we may have released
1423 	 * and reacquired rcm_req_lock. So find the index again.
1424 	 */
1425 	index = find_pid_index(pid);
1426 	if (index != -1) {
1427 		polllist.refcnt[index]++;
1428 		goto done;
1429 	}
1430 
1431 	/*
1432 	 * Open a /proc file
1433 	 */
1434 	(void) sprintf(procfile, "/proc/%ld/as", pid);
1435 	if ((fd = open(procfile, O_RDONLY)) == -1) {
1436 		rcm_log_message(RCM_NOTICE, gettext("open(%s): %s\n"),
1437 		    procfile, strerror(errno));
1438 		goto done;
1439 	}
1440 
1441 	/*
1442 	 * add pid to polllist
1443 	 */
1444 	index = get_pid_index();
1445 	polllist.pids[index] = pid;
1446 	polllist.refcnt[index] = 1;
1447 	polllist.fds[index].fd = fd;
1448 	polllist.fds[index].events = 0;
1449 	polllist.fds[index].revents = 0;
1450 
1451 	rcm_log_message(RCM_DEBUG, "add pid %ld at index %ld\n", pid, index);
1452 
1453 done:
1454 	start_polling_thread();
1455 }
1456 
1457 /*
1458  * rcm_req_lock must be held
1459  */
1460 static void
1461 remove_from_polling_list(pid_t pid)
1462 {
1463 	int i, index;
1464 
1465 	if (pid == (pid_t)0)
1466 		return;
1467 
1468 	rcm_log_message(RCM_TRACE1, "remove_from_polling_list(%ld)\n", pid);
1469 
1470 	/*
1471 	 * Need to stop the poll thread before manipulating the polllist
1472 	 * since poll thread may possibly be using polllist.fds[] and
1473 	 * polllist.n_pids. As an optimization, first check the pid
1474 	 * reference count. If the pid reference count is greater than 1
1475 	 * there is no need to stop the polling thread.
1476 	 */
1477 
1478 	index = find_pid_index(pid);
1479 	if (index == -1) {
1480 		rcm_log_message(RCM_NOTICE,
1481 		    gettext("error removing pid %ld from polling list\n"), pid);
1482 		return;
1483 	}
1484 
1485 	/*
1486 	 * decrement the pid refcnt
1487 	 */
1488 	if (polllist.refcnt[index] > 1) {
1489 		polllist.refcnt[index]--;
1490 		return;
1491 	}
1492 
1493 	stop_polling_thread();
1494 
1495 	/*
1496 	 * In an attempt to stop the poll thread we may have released
1497 	 * and reacquired rcm_req_lock. So find the index again.
1498 	 */
1499 	index = find_pid_index(pid);
1500 	if (index == -1) {
1501 		rcm_log_message(RCM_NOTICE,
1502 		    gettext("error removing pid %ld from polling list\n"), pid);
1503 		goto done;
1504 	}
1505 
1506 	if (--polllist.refcnt[index] > 0)
1507 		goto done;
1508 
1509 	/*
1510 	 * refcnt down to zero, delete pid from polling list
1511 	 */
1512 	(void) close(polllist.fds[index].fd);
1513 	polllist.n_pids--;
1514 
1515 	for (i = index; i < polllist.n_pids; i++) {
1516 		polllist.pids[i] = polllist.pids[i + 1];
1517 		polllist.refcnt[i] = polllist.refcnt[i + 1];
1518 		bcopy(&polllist.fds[i + 1], &polllist.fds[i],
1519 		    sizeof (struct pollfd));
1520 	}
1521 
1522 	rcm_log_message(RCM_DEBUG, "remove pid %ld at index %d\n", pid, index);
1523 
1524 done:
1525 	start_polling_thread();
1526 }
1527 
1528 void
1529 init_poll_thread()
1530 {
1531 	polllist.poll_tid = (thread_t)-1;
1532 }
1533 
1534 void
1535 cleanup_poll_thread()
1536 {
1537 	(void) mutex_lock(&rcm_req_lock);
1538 	if (polllist.poll_tid == thr_self()) {
1539 		rcm_log_message(RCM_TRACE2,
1540 		    "cleanup_poll_thread: n_pids = %d\n", polllist.n_pids);
1541 		polllist.poll_tid = (thread_t)-1;
1542 		(void) cond_broadcast(&polllist.cv);
1543 	}
1544 	(void) mutex_unlock(&rcm_req_lock);
1545 }
1546 
1547 /*ARGSUSED*/
1548 static void *
1549 pollfunc(void *arg)
1550 {
1551 	sigset_t mask;
1552 
1553 	rcm_log_message(RCM_TRACE2, "poll thread started. n_pids = %d\n",
1554 	    polllist.n_pids);
1555 
1556 	/*
1557 	 * Unblock SIGUSR1 to allow polling thread to be killed
1558 	 */
1559 	(void) sigemptyset(&mask);
1560 	(void) sigaddset(&mask, SIGUSR1);
1561 	(void) thr_sigsetmask(SIG_UNBLOCK, &mask, NULL);
1562 
1563 	(void) poll(polllist.fds, polllist.n_pids, (time_t)-1);
1564 
1565 	/*
1566 	 * block SIGUSR1 to avoid being killed while holding a lock
1567 	 */
1568 	(void) sigemptyset(&mask);
1569 	(void) sigaddset(&mask, SIGUSR1);
1570 	(void) thr_sigsetmask(SIG_BLOCK, &mask, NULL);
1571 
1572 	rcm_log_message(RCM_TRACE2, "returned from poll()\n");
1573 
1574 	cleanup_poll_thread();
1575 
1576 	(void) mutex_lock(&barrier.lock);
1577 	need_cleanup = 1;
1578 	(void) cond_broadcast(&barrier.cv);
1579 	(void) mutex_unlock(&barrier.lock);
1580 
1581 	return (NULL);
1582 }
1583 
1584 /*
1585  * rcm_req_lock must be held
1586  */
1587 void
1588 start_polling_thread()
1589 {
1590 	int err;
1591 
1592 	if (rcmd_get_state() != RCMD_NORMAL)
1593 		return;
1594 
1595 	if (polllist.poll_tid != (thread_t)-1 || polllist.n_pids == 0)
1596 		return;
1597 
1598 	if ((err = thr_create(NULL, 0, pollfunc, NULL, THR_DETACHED,
1599 	    &polllist.poll_tid)) == 0)
1600 		polllist.signaled = 0;
1601 	else
1602 		rcm_log_message(RCM_ERROR,
1603 		    gettext("failed to create polling thread: %s\n"),
1604 		    strerror(err));
1605 }
1606 
1607 /*
1608  * rcm_req_lock must be held
1609  */
1610 static void
1611 stop_polling_thread()
1612 {
1613 	int err;
1614 
1615 	while (polllist.poll_tid != (thread_t)-1) {
1616 		if (polllist.signaled == 0) {
1617 			if ((err = thr_kill(polllist.poll_tid, SIGUSR1)) == 0)
1618 				polllist.signaled = 1;
1619 			else
1620 				/*
1621 				 * thr_kill shouldn't have failed since the
1622 				 * poll thread id and the signal are valid.
1623 				 * So log an error. Since when thr_kill
1624 				 * fails no signal is sent (as per man page),
1625 				 * the cond_wait below will wait until the
1626 				 * the poll thread exits by some other means.
1627 				 * The poll thread, for example, exits on its
1628 				 * own when any DR initiator process that it
1629 				 * is currently polling exits.
1630 				 */
1631 				rcm_log_message(RCM_ERROR,
1632 				    gettext(
1633 				    "fail to kill polling thread %d: %s\n"),
1634 				    polllist.poll_tid, strerror(err));
1635 		}
1636 		(void) cond_wait(&polllist.cv, &rcm_req_lock);
1637 	}
1638 }
1639