xref: /illumos-gate/usr/src/cmd/rcm_daemon/common/rcm_lock.c (revision d48be21240dfd051b689384ce2b23479d757f2d8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  *
22  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "rcm_impl.h"
27 #include "rcm_module.h"
28 
29 /*
30  * Global locks
31  */
32 mutex_t rcm_req_lock;	/* protects global dr & info request list */
33 
34 /*
35  * Daemon state file
36  */
37 static int state_fd;
38 #define	RCM_STATE_FILE	"/var/run/rcm_daemon_state"
39 #define	N_REQ_CHUNK	10	/* grow 10 entries at a time */
40 
41 /*
42  * Daemon timeout value
43  */
44 #define	RCM_DAEMON_TIMEOUT	300	/* 5 minutes idle time */
45 
46 /*
47  * Struct for a list of outstanding rcm requests
48  */
49 typedef struct {
50 	int	seq_num;		/* sequence number of request */
51 	int	state;			/* current state */
52 	pid_t	pid;			/* pid of initiator */
53 	uint_t	flag;			/* request flags */
54 	int	type;			/* resource(device) type */
55 	timespec_t interval;		/* suspend interval */
56 	char	device[MAXPATHLEN];	/* name of device or resource */
57 } req_t;
58 
59 typedef struct {
60 	int	n_req;
61 	int	n_req_max;	/* number of req_t's to follow */
62 	int	n_seq_max;	/* last sequence number */
63 	int	idle_timeout;	/* persist idle timeout value */
64 	req_t	req[1];
65 	/* more req_t follows */
66 } req_list_t;
67 
68 static req_list_t *dr_req_list;
69 static req_list_t *info_req_list;
70 
71 static const char *locked_info = "DR operation in progress";
72 static const char *locked_err = "Resource is busy";
73 
74 static int rcmd_get_state();
75 static void add_to_polling_list(pid_t);
76 static void remove_from_polling_list(pid_t);
77 
78 void start_polling_thread();
79 static void stop_polling_thread();
80 
81 /*
82  * Initialize request lists required for locking
83  */
84 void
85 rcmd_lock_init(void)
86 {
87 	int size;
88 	struct stat fbuf;
89 
90 	/*
91 	 * Start info list with one slot, then grow on demand.
92 	 */
93 	info_req_list = s_calloc(1, sizeof (req_list_t));
94 	info_req_list->n_req_max = 1;
95 
96 	/*
97 	 * Open daemon state file and map in contents
98 	 */
99 	state_fd = open(RCM_STATE_FILE, O_CREAT|O_RDWR, 0600);
100 	if (state_fd == -1) {
101 		rcm_log_message(RCM_ERROR, gettext("cannot open %s: %s\n"),
102 		    RCM_STATE_FILE, strerror(errno));
103 		rcmd_exit(errno);
104 	}
105 
106 	if (fstat(state_fd, &fbuf) != 0) {
107 		rcm_log_message(RCM_ERROR, gettext("cannot stat %s: %s\n"),
108 		    RCM_STATE_FILE, strerror(errno));
109 		rcmd_exit(errno);
110 	}
111 
112 	size = fbuf.st_size;
113 	if (size == 0) {
114 		size = sizeof (req_list_t);
115 		if (ftruncate(state_fd, size) != 0) {
116 			rcm_log_message(RCM_ERROR,
117 			    gettext("cannot truncate %s: %s\n"),
118 			    RCM_STATE_FILE, strerror(errno));
119 			rcmd_exit(errno);
120 		}
121 	}
122 
123 	/*LINTED*/
124 	dr_req_list = (req_list_t *)mmap(NULL, size, PROT_READ|PROT_WRITE,
125 	    MAP_SHARED, state_fd, 0);
126 	if (dr_req_list == MAP_FAILED) {
127 		rcm_log_message(RCM_ERROR, gettext("cannot mmap %s: %s\n"),
128 		    RCM_STATE_FILE, strerror(errno));
129 		rcmd_exit(errno);
130 	}
131 
132 	/*
133 	 * Initial size is one entry
134 	 */
135 	if (dr_req_list->n_req_max == 0) {
136 		dr_req_list->n_req_max = 1;
137 		(void) fsync(state_fd);
138 		return;
139 	}
140 
141 	rcm_log_message(RCM_DEBUG, "n_req = %d, n_req_max = %d\n",
142 	    dr_req_list->n_req, dr_req_list->n_req_max);
143 
144 	/*
145 	 * Recover the daemon state
146 	 */
147 	clean_dr_list();
148 }
149 
150 /*
151  * Get a unique sequence number--to be called with rcm_req_lock held.
152  */
153 static int
154 get_seq_number()
155 {
156 	int number;
157 
158 	if (dr_req_list == NULL)
159 		return (0);
160 
161 	dr_req_list->n_seq_max++;
162 	number  = (dr_req_list->n_seq_max << SEQ_NUM_SHIFT);
163 	(void) fsync(state_fd);
164 
165 	return (number);
166 }
167 
168 /*
169  * Find entry in list with the same resource name and sequence number.
170  * If seq_num == -1, no seq_num matching is required.
171  */
172 static req_t *
173 find_req_entry(char *device, uint_t flag, int seq_num, req_list_t *list)
174 {
175 	int i;
176 
177 	/*
178 	 * Look for entry with the same resource and seq_num.
179 	 * Also match RCM_FILESYS field in flag.
180 	 */
181 	for (i = 0; i < list->n_req_max; i++) {
182 		if (list->req[i].state == RCM_STATE_REMOVE)
183 			/* stale entry */
184 			continue;
185 		/*
186 		 * We need to distiguish a file system root from the directory
187 		 * it is mounted on.
188 		 *
189 		 * Applications are not aware of any difference between the
190 		 * two, but the system keeps track of it internally by
191 		 * checking for mount points while traversing file path.
192 		 * In a similar spirit, RCM is keeping this difference as
193 		 * an implementation detail.
194 		 */
195 		if ((strcmp(device, list->req[i].device) != 0) ||
196 		    (list->req[i].flag & RCM_FILESYS) != (flag & RCM_FILESYS))
197 			/* different resource */
198 			continue;
199 
200 		if ((seq_num != -1) && ((seq_num >> SEQ_NUM_SHIFT) !=
201 		    (list->req[i].seq_num >> SEQ_NUM_SHIFT)))
202 			/* different base seqnum */
203 			continue;
204 
205 		return (&list->req[i]);
206 	}
207 
208 	return (NULL);
209 }
210 
211 /*
212  * Get the next empty req_t entry. If no entry exists, grow the list.
213  */
214 static req_t *
215 get_req_entry(req_list_t **listp)
216 {
217 	int i;
218 	int n_req = (*listp)->n_req;
219 	int n_req_max = (*listp)->n_req_max;
220 
221 	/*
222 	 * If the list is full, grow the list and return the first
223 	 * entry in the new portion.
224 	 */
225 	if (n_req == n_req_max) {
226 		int newsize;
227 
228 		n_req_max += N_REQ_CHUNK;
229 		newsize = sizeof (req_list_t) + (n_req_max - 1) *
230 		    sizeof (req_t);
231 
232 		if (listp == &info_req_list) {
233 			*listp = s_realloc(*listp, newsize);
234 		} else if (ftruncate(state_fd, newsize) != 0) {
235 			rcm_log_message(RCM_ERROR,
236 			    gettext("cannot truncate %s: %s\n"),
237 			    RCM_STATE_FILE, strerror(errno));
238 			rcmd_exit(errno);
239 		/*LINTED*/
240 		} else if ((*listp = (req_list_t *)mmap(NULL, newsize,
241 		    PROT_READ|PROT_WRITE, MAP_SHARED, state_fd, 0)) ==
242 		    MAP_FAILED) {
243 			rcm_log_message(RCM_ERROR,
244 			    gettext("cannot mmap %s: %s\n"),
245 			    RCM_STATE_FILE, strerror(errno));
246 			rcmd_exit(errno);
247 		}
248 
249 		/* Initialize the new entries */
250 		for (i = (*listp)->n_req_max; i < n_req_max; i++) {
251 			(*listp)->req[i].state = RCM_STATE_REMOVE;
252 			(void) strcpy((*listp)->req[i].device, "");
253 		}
254 
255 		(*listp)->n_req_max = n_req_max;
256 		(*listp)->n_req++;
257 		return (&(*listp)->req[n_req]);
258 	}
259 
260 	/*
261 	 * List contains empty slots, find it.
262 	 */
263 	for (i = 0; i < n_req_max; i++) {
264 		if (((*listp)->req[i].device[0] == '\0') ||
265 		    ((*listp)->req[i].state == RCM_STATE_REMOVE)) {
266 			break;
267 		}
268 	}
269 
270 	assert(i < n_req_max);	/* empty slot must exist */
271 
272 	(*listp)->n_req++;
273 	return (&(*listp)->req[i]);
274 }
275 
276 /*
277  * When one resource depends on multiple resources, it's possible that
278  * rcm_get_info can be called multiple times on the resource, resulting
279  * in duplicate information. By assigning a unique sequence number to
280  * each rcm_get_info operation, this duplication can be eliminated.
281  *
282  * Insert a dr entry in info_req_list
283  */
284 int
285 info_req_add(char *rsrcname, uint_t flag, int seq_num)
286 {
287 	int error = 0;
288 	char *device;
289 	req_t *req;
290 
291 	rcm_log_message(RCM_TRACE2, "info_req_add(%s, %d)\n",
292 	    rsrcname, seq_num);
293 
294 	device = resolve_name(rsrcname);
295 	(void) mutex_lock(&rcm_req_lock);
296 
297 	/*
298 	 * Look for entry with the same resource and seq_num.
299 	 * If it exists, we return an error so that such
300 	 * information is not gathered more than once.
301 	 */
302 	if (find_req_entry(device, flag, seq_num, info_req_list) != NULL) {
303 		rcm_log_message(RCM_DEBUG, "getinfo cycle: %s %d \n",
304 		    device, seq_num);
305 		error = -1;
306 		goto out;
307 	}
308 
309 	/*
310 	 * Get empty entry and fill in seq_num and device.
311 	 */
312 	req = get_req_entry(&info_req_list);
313 	req->seq_num = seq_num;
314 	req->state = RCM_STATE_ONLINE;  /* mark that the entry is in use */
315 	req->flag = flag;
316 	(void) strcpy(req->device, device);
317 
318 out:
319 	(void) mutex_unlock(&rcm_req_lock);
320 	free(device);
321 
322 	return (error);
323 }
324 
325 /*
326  * Remove all entries associated with seq_num from info_req_list
327  */
328 void
329 info_req_remove(int seq_num)
330 {
331 	int i;
332 
333 	rcm_log_message(RCM_TRACE3, "info_req_remove(%d)\n", seq_num);
334 
335 	seq_num >>= SEQ_NUM_SHIFT;
336 	(void) mutex_lock(&rcm_req_lock);
337 
338 	/* remove all entries with seq_num */
339 	for (i = 0; i < info_req_list->n_req_max; i++) {
340 		if (info_req_list->req[i].state == RCM_STATE_REMOVE)
341 			continue;
342 
343 		if ((info_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != seq_num)
344 			continue;
345 
346 		info_req_list->req[i].state = RCM_STATE_REMOVE;
347 		info_req_list->n_req--;
348 	}
349 
350 	/*
351 	 * We don't shrink the info_req_list size for now.
352 	 */
353 	(void) mutex_unlock(&rcm_req_lock);
354 }
355 
356 /*
357  * Checking lock conflicts. There is a conflict if:
358  * - attempt to DR a node when either its ancester or descendent
359  *	is in the process of DR
360  * - attempt to register for a node when its ancester is locked for DR
361  */
362 static int
363 check_lock(char *device, uint_t flag, int cflag, rcm_info_t **info)
364 {
365 	int i, ret = RCM_SUCCESS;
366 
367 	if (info)
368 		*info = NULL;
369 
370 	/*
371 	 * During daemon initialization, don't check locks
372 	 */
373 	if (dr_req_list == NULL)
374 		return (ret);
375 
376 	for (i = 0; i < dr_req_list->n_req; i++) {
377 		req_t *req = &dr_req_list->req[i];
378 		char *dr_dev = req->device;
379 
380 		/*
381 		 * Skip empty entries
382 		 */
383 		if ((req->state == RCM_STATE_REMOVE) || (dr_dev[0] == '\0'))
384 			continue;
385 
386 		/*
387 		 * Make sure that none of the ancestors of dr_dev is
388 		 * being operated upon.
389 		 */
390 		if (EQUAL(device, dr_dev) || DESCENDENT(device, dr_dev)) {
391 			/*
392 			 * An exception to this is the filesystem.
393 			 * We should allowed a filesystem rooted at a
394 			 * child directory to be unmounted.
395 			 */
396 			if ((flag & RCM_FILESYS) && (!EQUAL(device, dr_dev) ||
397 			    ((dr_req_list->req[i].flag & RCM_FILESYS) == 0)))
398 				continue;
399 
400 			assert(info != 0);
401 
402 			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
403 			    dr_req_list->req[i].state,
404 			    dr_req_list->req[i].seq_num, NULL, locked_info,
405 			    locked_err, NULL, info);
406 			ret = RCM_CONFLICT;
407 			break;
408 		}
409 
410 		if ((cflag == LOCK_FOR_DR) && DESCENDENT(dr_dev, device)) {
411 			/*
412 			 * Check descendents only for DR request.
413 			 *
414 			 * Could have multiple descendents doing DR,
415 			 * we want to find them all.
416 			 */
417 			assert(info != 0);
418 
419 			add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
420 			    dr_req_list->req[i].state,
421 			    dr_req_list->req[i].seq_num, NULL, locked_info,
422 			    locked_err, NULL, info);
423 			ret = RCM_CONFLICT;
424 			/* don't break here, need to find all conflicts */
425 		}
426 	}
427 
428 	return (ret);
429 }
430 
431 /*
432  * Check for lock conflicts for DR operation or client registration
433  */
434 int
435 rsrc_check_lock_conflicts(char *rsrcname, uint_t flag, int cflag,
436     rcm_info_t **info)
437 {
438 	int result;
439 	char *device;
440 
441 	device = resolve_name(rsrcname);
442 	result = check_lock(device, flag, cflag, info);
443 	free(device);
444 
445 	return (result);
446 }
447 
448 static int
449 transition_state(int state)
450 {
451 	/*
452 	 * If the resource state is in transition, ask caller to
453 	 * try again.
454 	 */
455 	switch (state) {
456 	case RCM_STATE_OFFLINING:
457 	case RCM_STATE_SUSPENDING:
458 	case RCM_STATE_RESUMING:
459 	case RCM_STATE_ONLINING:
460 	case RCM_STATE_REMOVING:
461 
462 		return (1);
463 
464 	default:
465 		/*FALLTHROUGH*/
466 		break;
467 	}
468 	return (0);
469 }
470 
471 /*
472  * Update a dr entry in dr_req_list
473  */
474 /*ARGSUSED*/
475 static int
476 dr_req_update_entry(char *device, pid_t pid, uint_t flag, int state,
477     int seq_num, timespec_t *interval, rcm_info_t **infop)
478 {
479 	req_t *req;
480 
481 	/*
482 	 * Find request entry. If not found, return RCM_FAILURE
483 	 */
484 	req = find_req_entry(device, flag, -1, dr_req_list);
485 
486 	if (req == NULL) {
487 		switch (state) {
488 		case RCM_STATE_OFFLINE_QUERYING:
489 		case RCM_STATE_SUSPEND_QUERYING:
490 		case RCM_STATE_OFFLINING:
491 		case RCM_STATE_SUSPENDING:
492 			/* could be re-do operation, no error message */
493 			break;
494 
495 		default:
496 			rcm_log_message(RCM_DEBUG,
497 			    "update non-existing resource %s\n", device);
498 		}
499 		return (RCM_FAILURE);
500 	}
501 
502 	/*
503 	 * During initialization, update is unconditional (forced)
504 	 * in order to bring the daemon up in a sane state.
505 	 */
506 	if (rcmd_get_state() == RCMD_INIT)
507 		goto update;
508 
509 	/*
510 	 * Don't allow update with mismatched initiator pid. This could happen
511 	 * as part of normal operation.
512 	 */
513 	if (pid != req->pid) {
514 		rcm_log_message(RCM_INFO,
515 		    gettext("mismatched dr initiator pid: %ld %ld\n"),
516 		    req->pid, pid);
517 		goto failure;
518 	}
519 
520 	rcm_log_message(RCM_TRACE4,
521 	    "dr_req_update_entry: state=%d, device=%s\n",
522 	    req->state, req->device);
523 
524 	/*
525 	 * Check that the state transition is valid
526 	 */
527 	switch (state) {
528 	case RCM_STATE_OFFLINE_QUERYING:
529 	case RCM_STATE_OFFLINING:
530 		/*
531 		 * This is the case of re-offlining, which applies only
532 		 * if a previous attempt failed.
533 		 */
534 		if ((req->state != RCM_STATE_OFFLINE_FAIL) &&
535 		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
536 		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
537 		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
538 		    (req->state != RCM_STATE_OFFLINE)) {
539 			rcm_log_message(RCM_WARNING,
540 			    gettext("%s: invalid offlining from state %d\n"),
541 			    device, req->state);
542 			goto failure;
543 		}
544 		break;
545 
546 	case RCM_STATE_SUSPEND_QUERYING:
547 	case RCM_STATE_SUSPENDING:
548 		/*
549 		 * This is the case of re-suspending, which applies only
550 		 * if a previous attempt failed.
551 		 */
552 		if ((req->state != RCM_STATE_SUSPEND_FAIL) &&
553 		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
554 		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
555 		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
556 		    (req->state != RCM_STATE_SUSPEND)) {
557 			rcm_log_message(RCM_WARNING,
558 			    gettext("%s: invalid suspending from state %d\n"),
559 			    device, req->state);
560 			goto failure;
561 		}
562 		break;
563 
564 	case RCM_STATE_RESUMING:
565 		if ((req->state != RCM_STATE_SUSPEND) &&
566 		    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
567 		    (req->state != RCM_STATE_SUSPEND_QUERY) &&
568 		    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
569 		    (req->state != RCM_STATE_SUSPEND_FAIL)) {
570 			rcm_log_message(RCM_DEBUG,
571 			    "%s: invalid resuming from state %d\n",
572 			    device, req->state);
573 			goto failure;
574 		}
575 		break;
576 
577 	case RCM_STATE_ONLINING:
578 		if ((req->state != RCM_STATE_OFFLINE) &&
579 		    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
580 		    (req->state != RCM_STATE_OFFLINE_QUERY) &&
581 		    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
582 		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
583 			rcm_log_message(RCM_INFO,
584 			    gettext("%s: invalid onlining from state %d\n"),
585 			    device, req->state);
586 			goto failure;
587 		}
588 		break;
589 
590 	case RCM_STATE_REMOVING:
591 		if ((req->state != RCM_STATE_OFFLINE) &&
592 		    (req->state != RCM_STATE_OFFLINE_FAIL)) {
593 			rcm_log_message(RCM_INFO,
594 			    gettext("%s: invalid removing from state %d\n"),
595 			    device, req->state);
596 			goto failure;
597 		}
598 		break;
599 
600 	case RCM_STATE_SUSPEND_FAIL:
601 		assert(req->state == RCM_STATE_SUSPENDING);
602 		break;
603 
604 	case RCM_STATE_OFFLINE_FAIL:
605 		assert(req->state == RCM_STATE_OFFLINING);
606 		break;
607 
608 	case RCM_STATE_SUSPEND:
609 		assert(req->state == RCM_STATE_SUSPENDING);
610 		break;
611 
612 	case RCM_STATE_OFFLINE:
613 		assert(req->state == RCM_STATE_OFFLINING);
614 		break;
615 
616 	case RCM_STATE_ONLINE:
617 		assert((req->state == RCM_STATE_RESUMING) ||
618 		    (req->state == RCM_STATE_ONLINING));
619 		break;
620 
621 	default:	/* shouldn't be here */
622 		rcm_log_message(RCM_ERROR,
623 		    gettext("invalid update to dr state: %d\n"), state);
624 		return (RCM_FAILURE);
625 	}
626 
627 update:
628 	/*
629 	 * update the state, interval, and sequence number; sync state file
630 	 */
631 	req->state = state;
632 	req->seq_num = seq_num;
633 
634 	if (interval)
635 		req->interval = *interval;
636 	else
637 		bzero(&req->interval, sizeof (timespec_t));
638 
639 	(void) fsync(state_fd);
640 	return (RCM_SUCCESS);
641 
642 failure:
643 	if (infop != NULL) {
644 		add_busy_rsrc_to_list(req->device, req->pid, req->state,
645 		    req->seq_num, NULL, locked_info, locked_err, NULL, infop);
646 	}
647 
648 	/*
649 	 * A request may be left in a transition state because the operator
650 	 * typed ctrl-C. In this case, the daemon thread continues to run
651 	 * and will eventually put the state in a non-transitional state.
652 	 *
653 	 * To be safe, we return EAGAIN to allow librcm to loop and retry.
654 	 * If we are called from a module, loop & retry could result in a
655 	 * deadlock. The called will check for this case and turn EAGAIN
656 	 * into RCM_CONFLICT.
657 	 */
658 	if (transition_state(req->state)) {
659 		return (EAGAIN);
660 	}
661 
662 	return (RCM_CONFLICT);
663 }
664 
665 /*
666  * Insert a dr entry in dr_req_list
667  */
668 int
669 dr_req_add(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
670     timespec_t *interval, rcm_info_t **info)
671 {
672 	int error;
673 	char *device;
674 	req_t *req;
675 
676 	rcm_log_message(RCM_TRACE3, "dr_req_add(%s, %ld, 0x%x, %d, %d, %p)\n",
677 	    rsrcname, pid, flag, state, seq_num, (void *)info);
678 
679 	device = resolve_name(rsrcname);
680 	if (device == NULL)
681 		return (EINVAL);
682 
683 	(void) mutex_lock(&rcm_req_lock);
684 
685 	/*
686 	 * In the re-offline/suspend case, attempt to update dr request.
687 	 *
688 	 * If this succeeds, return success;
689 	 * If this fails because of a conflict, return error;
690 	 * If this this fails because no entry exists, add a new entry.
691 	 */
692 	error = dr_req_update_entry(device, pid, flag, state, seq_num, interval,
693 	    info);
694 
695 	switch (error) {
696 	case RCM_FAILURE:
697 		/* proceed to add a new entry */
698 		break;
699 
700 	case RCM_CONFLICT:
701 	case RCM_SUCCESS:
702 	case EAGAIN:
703 	default:
704 		goto out;
705 	}
706 
707 	/*
708 	 * Check for lock conflicts
709 	 */
710 	error = check_lock(device, flag, LOCK_FOR_DR, info);
711 	if (error != RCM_SUCCESS) {
712 		error = RCM_CONFLICT;
713 		goto out;
714 	}
715 
716 	/*
717 	 * Get empty request entry, fill in values and sync state file
718 	 */
719 	req = get_req_entry(&dr_req_list);
720 
721 	req->seq_num = seq_num;
722 	req->pid = pid;
723 	req->flag = flag;
724 	req->state = state;
725 	req->type = rsrc_get_type(device);
726 	(void) strcpy(req->device, device);
727 
728 	/* cache interval for failure recovery */
729 	if (interval)
730 		req->interval = *interval;
731 	else
732 		bzero(&req->interval, sizeof (timespec_t));
733 
734 	(void) fsync(state_fd);
735 
736 	/*
737 	 * Add initiator pid to polling list
738 	 */
739 	add_to_polling_list(req->pid);
740 
741 out:
742 	(void) mutex_unlock(&rcm_req_lock);
743 	free(device);
744 
745 	return (error);
746 }
747 
748 /*
749  * Update a dr entry in dr_req_list
750  */
751 /*ARGSUSED*/
752 int
753 dr_req_update(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
754     rcm_info_t **info)
755 {
756 	int error;
757 	char *device = resolve_name(rsrcname);
758 
759 	rcm_log_message(RCM_TRACE3, "dr_req_update(%s, %ld, 0x%x, %d, %d)\n",
760 	    rsrcname, pid, flag, state, seq_num);
761 
762 	(void) mutex_lock(&rcm_req_lock);
763 	error = dr_req_update_entry(device, pid, flag, state, seq_num, NULL,
764 	    info);
765 	(void) mutex_unlock(&rcm_req_lock);
766 	free(device);
767 
768 	return (error);
769 }
770 
771 /*
772  * This function scans the DR request list for the next, non-removed
773  * entry that is part of the specified sequence.  The 'device' name
774  * of the entry is copied into the provided 'rsrc' buffer.
775  *
776  * The 'rsrc' buffer is required because the DR request list is only
777  * locked during the duration of this lookup.  Giving a direct pointer
778  * to something in the list would be unsafe.
779  */
780 int
781 dr_req_lookup(int seq_num, char *rsrc)
782 {
783 	int	i;
784 	int	len;
785 	int	base = (seq_num >> SEQ_NUM_SHIFT);
786 	int	retval = RCM_FAILURE;
787 
788 	if (rsrc == NULL) {
789 		return (RCM_FAILURE);
790 	}
791 
792 	(void) mutex_lock(&rcm_req_lock);
793 
794 	for (i = 0; i < dr_req_list->n_req_max; i++) {
795 
796 		/* Skip removed or non-matching entries */
797 		if ((dr_req_list->req[i].state == RCM_STATE_REMOVE) ||
798 		    ((dr_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != base)) {
799 			continue;
800 		}
801 
802 		/* Copy the next-matching 'device' name into 'rsrc' */
803 		len = strlcpy(rsrc, dr_req_list->req[i].device, MAXPATHLEN);
804 		if (len < MAXPATHLEN) {
805 			retval = RCM_SUCCESS;
806 		}
807 		break;
808 	}
809 
810 	(void) mutex_unlock(&rcm_req_lock);
811 
812 	return (retval);
813 }
814 
815 /*
816  * Remove a dr entry in dr_req_list
817  */
818 void
819 dr_req_remove(char *rsrcname, uint_t flag)
820 {
821 	req_t *req;
822 	char *device = resolve_name(rsrcname);
823 
824 	rcm_log_message(RCM_TRACE3, "dr_req_remove(%s)\n", rsrcname);
825 
826 	(void) mutex_lock(&rcm_req_lock);
827 
828 	/* find entry */
829 	req = find_req_entry(device, flag, -1, dr_req_list);
830 	free(device);
831 
832 	if (req == NULL) {
833 		(void) mutex_unlock(&rcm_req_lock);
834 		rcm_log_message(RCM_WARNING,
835 		    gettext("dr_req entry %s not found\n"), rsrcname);
836 		return;
837 	}
838 
839 	req->state = RCM_STATE_REMOVE;
840 	dr_req_list->n_req--;
841 	(void) fsync(state_fd);
842 
843 	/*
844 	 * remove pid from polling list
845 	 */
846 	remove_from_polling_list(req->pid);
847 
848 	/*
849 	 * We don't shrink the dr_req_list size for now.
850 	 * Shouldn't cause big memory leaks.
851 	 */
852 	(void) mutex_unlock(&rcm_req_lock);
853 }
854 
855 /*
856  * Return the list of ongoing dr operation requests
857  */
858 rcm_info_t *
859 rsrc_dr_info()
860 {
861 	int i;
862 	rcm_info_t *info;
863 	rcm_info_t *result = NULL;
864 	char *rsrc;
865 	int len;
866 
867 	rcm_log_message(RCM_TRACE2, "rsrc_dr_info()\n");
868 
869 	(void) mutex_lock(&rcm_req_lock);
870 	for (i = 0; i < dr_req_list->n_req_max; i++) {
871 		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
872 			continue;
873 
874 		if (dr_req_list->req[i].device[0] == '\0')
875 			continue;
876 
877 		if (dr_req_list->req[i].flag & RCM_FILESYS) {
878 			len = strlen(dr_req_list->req[i].device) + 5;
879 			rsrc = s_malloc(len);
880 			(void) snprintf(rsrc, len, "%s(fs)",
881 			    dr_req_list->req[i].device);
882 		} else {
883 			rsrc = s_strdup(dr_req_list->req[i].device);
884 		}
885 
886 		info = s_calloc(1, sizeof (*info));
887 		if (errno = nvlist_alloc(&(info->info), NV_UNIQUE_NAME, 0)) {
888 			rcm_log_message(RCM_ERROR,
889 			    gettext("failed (nvlist_alloc=%s).\n"),
890 			    strerror(errno));
891 			rcmd_exit(errno);
892 		}
893 
894 		if (errno = nvlist_add_string(info->info, RCM_RSRCNAME, rsrc)) {
895 			rcm_log_message(RCM_ERROR,
896 			    gettext("failed (nvlist_add=%s).\n"),
897 			    strerror(errno));
898 			rcmd_exit(errno);
899 		}
900 		(void) free(rsrc);
901 
902 		if (errno = nvlist_add_int64(info->info, RCM_CLIENT_ID,
903 		    dr_req_list->req[i].pid)) {
904 			rcm_log_message(RCM_ERROR,
905 			    gettext("failed (nvlist_add=%s).\n"),
906 			    strerror(errno));
907 			rcmd_exit(errno);
908 		}
909 
910 		if (errno = nvlist_add_int32(info->info, RCM_SEQ_NUM,
911 		    dr_req_list->req[i].seq_num)) {
912 			rcm_log_message(RCM_ERROR,
913 			    gettext("failed (nvlist_add=%s).\n"),
914 			    strerror(errno));
915 			rcmd_exit(errno);
916 		}
917 
918 		if (errno = nvlist_add_int32(info->info, RCM_RSRCSTATE,
919 		    dr_req_list->req[i].state)) {
920 			rcm_log_message(RCM_ERROR,
921 			    gettext("failed (nvlist_add=%s).\n"),
922 			    strerror(errno));
923 			rcmd_exit(errno);
924 		}
925 
926 		if (errno = nvlist_add_string(info->info, RCM_CLIENT_INFO,
927 		    (char *)locked_info)) {
928 			rcm_log_message(RCM_ERROR,
929 			    gettext("failed (nvlist_add=%s).\n"),
930 			    strerror(errno));
931 			rcmd_exit(errno);
932 		}
933 
934 		info->next = result;
935 		result = info;
936 	}
937 	(void) mutex_unlock(&rcm_req_lock);
938 
939 	return (result);
940 }
941 
942 /*
943  * Eliminate entries whose dr initiator is no longer running
944  * and recover daemon state during daemon restart.
945  *
946  * This routine is called from either during daemon initialization
947  * after all modules have registered resources or from the cleanup
948  * thread. In either case, it is the only thread running in the
949  * daemon.
950  */
951 void
952 clean_dr_list()
953 {
954 	int i;
955 	struct clean_list {
956 		struct clean_list *next;
957 		char *rsrcname;
958 		pid_t pid;
959 		int seq_num;
960 		int state;
961 		timespec_t interval;
962 	} *tmp, *list = NULL;
963 	char *rsrcnames[2];
964 
965 	rcm_log_message(RCM_TRACE3,
966 	    "clean_dr_list(): look for stale dr initiators\n");
967 
968 	rsrcnames[1] = NULL;
969 
970 	/*
971 	 * Make a list of entries to recover. This is necessary because
972 	 * the recovery operation will modify dr_req_list.
973 	 */
974 	(void) mutex_lock(&rcm_req_lock);
975 	for (i = 0; i < dr_req_list->n_req_max; i++) {
976 		/* skip empty entries */
977 		if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
978 			continue;
979 
980 		if (dr_req_list->req[i].device[0] == '\0')
981 			continue;
982 
983 		/* skip cascade operations */
984 		if (dr_req_list->req[i].seq_num & SEQ_NUM_MASK)
985 			continue;
986 
987 		/*
988 		 * In the cleanup case, ignore entries with initiators alive
989 		 */
990 		if ((rcmd_get_state() == RCMD_CLEANUP) &&
991 		    proc_exist(dr_req_list->req[i].pid))
992 			continue;
993 
994 		rcm_log_message(RCM_TRACE1,
995 		    "found stale entry: %s\n", dr_req_list->req[i].device);
996 
997 		tmp = s_malloc(sizeof (*tmp));
998 		tmp->rsrcname = s_strdup(dr_req_list->req[i].device);
999 		tmp->state = dr_req_list->req[i].state;
1000 		tmp->pid = dr_req_list->req[i].pid;
1001 		tmp->seq_num = dr_req_list->req[i].seq_num;
1002 		tmp->interval = dr_req_list->req[i].interval;
1003 		tmp->next = list;
1004 		list = tmp;
1005 	}
1006 	(void) mutex_unlock(&rcm_req_lock);
1007 
1008 	if (list == NULL)
1009 		return;
1010 
1011 	/*
1012 	 * If everything worked normally, we shouldn't be here.
1013 	 * Since we are here, something went wrong, so say something.
1014 	 */
1015 	if (rcmd_get_state() == RCMD_INIT) {
1016 		rcm_log_message(RCM_NOTICE, gettext("rcm_daemon died "
1017 		    "unexpectedly, recovering previous daemon state\n"));
1018 	} else {
1019 		rcm_log_message(RCM_INFO, gettext("one or more dr initiator "
1020 		    "died, attempting automatic recovery\n"));
1021 	}
1022 
1023 	while (list) {
1024 		tmp = list;
1025 		list = tmp->next;
1026 
1027 		switch (tmp->state) {
1028 		case RCM_STATE_OFFLINE_QUERY:
1029 		case RCM_STATE_OFFLINE_QUERY_FAIL:
1030 			rsrcnames[0] = tmp->rsrcname;
1031 			if (proc_exist(tmp->pid)) {
1032 				/* redo */
1033 				(void) process_resource_offline(rsrcnames,
1034 				    tmp->pid, RCM_QUERY, tmp->seq_num, NULL);
1035 			} else {
1036 				/* undo */
1037 				(void) notify_resource_online(rsrcnames,
1038 				    tmp->pid, 0, tmp->seq_num, NULL);
1039 			}
1040 			break;
1041 
1042 		case RCM_STATE_OFFLINE:
1043 		case RCM_STATE_OFFLINE_FAIL:
1044 			rsrcnames[0] = tmp->rsrcname;
1045 			if (proc_exist(tmp->pid)) {
1046 				/* redo */
1047 				(void) process_resource_offline(rsrcnames,
1048 				    tmp->pid, 0, tmp->seq_num, NULL);
1049 			} else {
1050 				/* undo */
1051 				(void) notify_resource_online(rsrcnames,
1052 				    tmp->pid, 0, tmp->seq_num, NULL);
1053 			}
1054 			break;
1055 
1056 		case RCM_STATE_SUSPEND_QUERY:
1057 		case RCM_STATE_SUSPEND_QUERY_FAIL:
1058 			rsrcnames[0] = tmp->rsrcname;
1059 			if (proc_exist(tmp->pid)) {
1060 				/* redo */
1061 				(void) process_resource_suspend(rsrcnames,
1062 				    tmp->pid, RCM_QUERY, tmp->seq_num,
1063 				    &tmp->interval, NULL);
1064 			} else {
1065 				/* undo */
1066 				(void) notify_resource_resume(rsrcnames,
1067 				    tmp->pid, 0, tmp->seq_num, NULL);
1068 			}
1069 			break;
1070 
1071 		case RCM_STATE_SUSPEND:
1072 		case RCM_STATE_SUSPEND_FAIL:
1073 			rsrcnames[0] = tmp->rsrcname;
1074 			if (proc_exist(tmp->pid)) {
1075 				/* redo */
1076 				(void) process_resource_suspend(rsrcnames,
1077 				    tmp->pid, 0, tmp->seq_num, &tmp->interval,
1078 				    NULL);
1079 			} else {
1080 				/* undo */
1081 				(void) notify_resource_resume(rsrcnames,
1082 				    tmp->pid, 0, tmp->seq_num, NULL);
1083 			}
1084 			break;
1085 
1086 		case RCM_STATE_OFFLINING:
1087 		case RCM_STATE_ONLINING:
1088 			rsrcnames[0] = tmp->rsrcname;
1089 			(void) notify_resource_online(rsrcnames, tmp->pid, 0,
1090 			    tmp->seq_num, NULL);
1091 			break;
1092 
1093 		case RCM_STATE_SUSPENDING:
1094 		case RCM_STATE_RESUMING:
1095 			rsrcnames[0] = tmp->rsrcname;
1096 			(void) notify_resource_resume(rsrcnames, tmp->pid, 0,
1097 			    tmp->seq_num, NULL);
1098 			break;
1099 
1100 		case RCM_STATE_REMOVING:
1101 			rsrcnames[0] = tmp->rsrcname;
1102 			(void) notify_resource_remove(rsrcnames, tmp->pid, 0,
1103 			    tmp->seq_num, NULL);
1104 			break;
1105 
1106 		default:
1107 			rcm_log_message(RCM_WARNING,
1108 			    gettext("%s in unknown state %d\n"),
1109 			    tmp->rsrcname, tmp->state);
1110 			break;
1111 		}
1112 		free(tmp->rsrcname);
1113 		free(tmp);
1114 	}
1115 }
1116 
1117 /*
1118  * Selected thread blocking based on event type
1119  */
1120 barrier_t barrier;
1121 
1122 /*
1123  * Change barrier state:
1124  *	RCMD_INIT - daemon is intializing, only register allowed
1125  *	RCMD_NORMAL - normal daemon processing
1126  *	RCMD_CLEANUP - cleanup thread is waiting or running
1127  */
1128 int
1129 rcmd_get_state()
1130 {
1131 	return (barrier.state);
1132 }
1133 
1134 void
1135 rcmd_set_state(int state)
1136 {
1137 	/*
1138 	 * The state transition is as follows:
1139 	 *	INIT --> NORMAL <---> CLEANUP
1140 	 * The implementation favors the cleanup thread
1141 	 */
1142 
1143 	(void) mutex_lock(&barrier.lock);
1144 	barrier.state = state;
1145 
1146 	switch (state) {
1147 	case RCMD_CLEANUP:
1148 		/*
1149 		 * Wait for existing threads to exit
1150 		 */
1151 		barrier.wanted++;
1152 		while (barrier.thr_count != 0)
1153 			(void) cond_wait(&barrier.cv, &barrier.lock);
1154 		barrier.wanted--;
1155 		barrier.thr_count = -1;
1156 		break;
1157 
1158 	case RCMD_INIT:
1159 	case RCMD_NORMAL:
1160 	default:
1161 		if (barrier.thr_count == -1)
1162 			barrier.thr_count = 0;
1163 		if (barrier.wanted)
1164 			(void) cond_broadcast(&barrier.cv);
1165 		break;
1166 	}
1167 
1168 	(void) mutex_unlock(&barrier.lock);
1169 }
1170 
1171 /*
1172  * Increment daemon thread count
1173  */
1174 int
1175 rcmd_thr_incr(int cmd)
1176 {
1177 	int seq_num;
1178 
1179 	(void) mutex_lock(&barrier.lock);
1180 	/*
1181 	 * Set wanted flag
1182 	 */
1183 	barrier.wanted++;
1184 
1185 	/*
1186 	 * Wait till it is safe for daemon to perform the operation
1187 	 *
1188 	 * NOTE: if a module registers by passing a request to the
1189 	 *	client proccess, we may need to allow register
1190 	 *	to come through during daemon initialization.
1191 	 */
1192 	while (barrier.state != RCMD_NORMAL)
1193 		(void) cond_wait(&barrier.cv, &barrier.lock);
1194 
1195 	if ((cmd == CMD_EVENT) ||
1196 	    (cmd == CMD_REGISTER) ||
1197 	    (cmd == CMD_UNREGISTER)) {
1198 		/*
1199 		 * Event passthru and register ops don't need sequence number
1200 		 */
1201 		seq_num = -1;
1202 	} else {
1203 		/*
1204 		 * Non register operation gets a sequence number
1205 		 */
1206 		seq_num = get_seq_number();
1207 	}
1208 	barrier.wanted--;
1209 	barrier.thr_count++;
1210 	(void) mutex_unlock(&barrier.lock);
1211 
1212 	if ((cmd == CMD_OFFLINE) ||
1213 	    (cmd == CMD_SUSPEND) ||
1214 	    (cmd == CMD_GETINFO)) {
1215 		/*
1216 		 * For these operations, need to ask modules to
1217 		 * register any new resources that came online.
1218 		 *
1219 		 * This is because mount/umount are not instrumented
1220 		 * to register with rcm before using system resources.
1221 		 * Certain registration ops may fail during sync, which
1222 		 * indicates race conditions. This cannot be avoided
1223 		 * without changing mount/umount.
1224 		 */
1225 		rcmd_db_sync();
1226 	}
1227 
1228 	return (seq_num);
1229 }
1230 
1231 /*
1232  * Decrement thread count
1233  */
1234 void
1235 rcmd_thr_decr()
1236 {
1237 	/*
1238 	 * Decrement thread count and wake up reload/cleanup thread.
1239 	 */
1240 	(void) mutex_lock(&barrier.lock);
1241 	barrier.last_update = time(NULL);
1242 	if (--barrier.thr_count == 0)
1243 		(void) cond_broadcast(&barrier.cv);
1244 	(void) mutex_unlock(&barrier.lock);
1245 }
1246 
/*
 * Wakeup all waiting threads as a result of SIGHUP
 *
 * sighup_received is set by rcmd_thr_signal() with barrier.lock held and
 * is examined by rcmd_start_timer() after each wakeup; once it is set the
 * timer forces its timeout to 0 so the daemon shuts down as soon as it is
 * idle.
 */
static int sighup_received = 0;

void
rcmd_thr_signal()
{
	(void) mutex_lock(&barrier.lock);
	sighup_received = 1;
	/* wake every thread blocked on barrier.cv, including the timer */
	(void) cond_broadcast(&barrier.cv);
	(void) mutex_unlock(&barrier.lock);
}
1260 
1261 void
1262 rcmd_start_timer(int timeout)
1263 {
1264 	timestruc_t abstime;
1265 
1266 	if (timeout == 0)
1267 		timeout = RCM_DAEMON_TIMEOUT;	/* default to 5 minutes */
1268 	else
1269 		dr_req_list->idle_timeout = timeout;	/* persist timeout */
1270 
1271 	if (timeout > 0) {
1272 		abstime.tv_sec = time(NULL) + timeout;
1273 	}
1274 
1275 	(void) mutex_lock(&barrier.lock);
1276 	for (;;) {
1277 		int idletime;
1278 		int is_active;
1279 
1280 		if (timeout > 0)
1281 			(void) cond_timedwait(&barrier.cv, &barrier.lock,
1282 			    &abstime);
1283 		else
1284 			(void) cond_wait(&barrier.cv, &barrier.lock);
1285 
1286 		/*
1287 		 * If sighup received, change timeout to 0 so the daemon is
1288 		 * shut down at the first possible moment
1289 		 */
1290 		if (sighup_received)
1291 			timeout = 0;
1292 
1293 		/*
1294 		 * If timeout is negative, never shutdown the daemon
1295 		 */
1296 		if (timeout < 0)
1297 			continue;
1298 
1299 		/*
1300 		 * Check for ongoing/pending activity
1301 		 */
1302 		is_active = (barrier.thr_count || barrier.wanted ||
1303 		    (dr_req_list->n_req != 0));
1304 		if (is_active) {
1305 			abstime.tv_sec = time(NULL) + timeout;
1306 			continue;
1307 		}
1308 
1309 		/*
1310 		 * If idletime is less than timeout, continue to wait
1311 		 */
1312 		idletime = time(NULL) - barrier.last_update;
1313 		if (idletime < timeout) {
1314 			abstime.tv_sec = barrier.last_update + timeout;
1315 			continue;
1316 		}
1317 		break;
1318 	}
1319 
1320 	(void) script_main_fini();
1321 
1322 	rcm_log_message(RCM_INFO, gettext("rcm_daemon is shut down.\n"));
1323 }
1324 
/*
 * Code related to polling client pid's
 * Not declared as static so that we can find this structure easily
 * in the core file.
 *
 * pids[], refcnt[] and fds[] are parallel arrays: entry i of each one
 * describes the same polled process.
 */
struct {
	int		n_pids;		/* entries currently in use */
	int		n_max_pids;	/* entries allocated in the arrays */
	thread_t	poll_tid;	/* poll thread id */
	int		signaled;	/* SIGUSR1 already sent to poll thread */
	pid_t		*pids;		/* pid of each polled process */
	int		*refcnt;	/* reference count for each pid */
	struct pollfd	*fds;		/* /proc fd handed to poll() per pid */
	cond_t		cv;	/* the associated lock is rcm_req_lock */
} polllist;
1340 
1341 static int
1342 find_pid_index(pid_t pid)
1343 {
1344 	int i;
1345 
1346 	for (i = 0; i < polllist.n_pids; i++) {
1347 		if (polllist.pids[i] == pid) {
1348 			return (i);
1349 		}
1350 	}
1351 	return (-1);
1352 }
1353 
1354 /*
1355  * Resize buffer for new pids
1356  */
1357 static int
1358 get_pid_index()
1359 {
1360 	const int n_chunk = 10;
1361 
1362 	int n_max;
1363 	int index = polllist.n_pids;
1364 
1365 	if (polllist.n_pids < polllist.n_max_pids) {
1366 		polllist.n_pids++;
1367 		return (index);
1368 	}
1369 
1370 	if (polllist.n_max_pids == 0) {
1371 		n_max = n_chunk;
1372 		polllist.pids = s_calloc(n_max, sizeof (pid_t));
1373 		polllist.refcnt = s_calloc(n_max, sizeof (int));
1374 		polllist.fds = s_calloc(n_max, sizeof (struct pollfd));
1375 	} else {
1376 		n_max = polllist.n_max_pids + n_chunk;
1377 		polllist.pids = s_realloc(polllist.pids,
1378 		    n_max * sizeof (pid_t));
1379 		polllist.refcnt = s_realloc(polllist.refcnt,
1380 		    n_max * sizeof (int));
1381 		polllist.fds = s_realloc(polllist.fds,
1382 		    n_max * sizeof (struct pollfd));
1383 	}
1384 	polllist.n_max_pids = n_max;
1385 	polllist.n_pids++;
1386 	return (index);
1387 }
1388 
1389 /*
1390  * rcm_req_lock must be held
1391  */
1392 static void
1393 add_to_polling_list(pid_t pid)
1394 {
1395 	int fd, index;
1396 	char procfile[MAXPATHLEN];
1397 
1398 	if (pid == (pid_t)0)
1399 		return;
1400 
1401 	rcm_log_message(RCM_TRACE1, "add_to_polling_list(%ld)\n", pid);
1402 
1403 	/*
1404 	 * Need to stop the poll thread before manipulating the polllist
1405 	 * since poll thread may possibly be using polllist.fds[] and
1406 	 * polllist.n_pids. As an optimization, first check if the pid
1407 	 * is already in the polllist. If it is, there is no need to
1408 	 * stop the poll thread. Just increment the pid reference count
1409 	 * and return;
1410 	 */
1411 	index = find_pid_index(pid);
1412 	if (index != -1) {
1413 		polllist.refcnt[index]++;
1414 		return;
1415 	}
1416 
1417 	stop_polling_thread();
1418 
1419 	/*
1420 	 * In an attempt to stop the poll thread we may have released
1421 	 * and reacquired rcm_req_lock. So find the index again.
1422 	 */
1423 	index = find_pid_index(pid);
1424 	if (index != -1) {
1425 		polllist.refcnt[index]++;
1426 		goto done;
1427 	}
1428 
1429 	/*
1430 	 * Open a /proc file
1431 	 */
1432 	(void) sprintf(procfile, "/proc/%ld/as", pid);
1433 	if ((fd = open(procfile, O_RDONLY)) == -1) {
1434 		rcm_log_message(RCM_NOTICE, gettext("open(%s): %s\n"),
1435 		    procfile, strerror(errno));
1436 		goto done;
1437 	}
1438 
1439 	/*
1440 	 * add pid to polllist
1441 	 */
1442 	index = get_pid_index();
1443 	polllist.pids[index] = pid;
1444 	polllist.refcnt[index] = 1;
1445 	polllist.fds[index].fd = fd;
1446 	polllist.fds[index].events = 0;
1447 	polllist.fds[index].revents = 0;
1448 
1449 	rcm_log_message(RCM_DEBUG, "add pid %ld at index %ld\n", pid, index);
1450 
1451 done:
1452 	start_polling_thread();
1453 }
1454 
1455 /*
1456  * rcm_req_lock must be held
1457  */
1458 static void
1459 remove_from_polling_list(pid_t pid)
1460 {
1461 	int i, index;
1462 
1463 	if (pid == (pid_t)0)
1464 		return;
1465 
1466 	rcm_log_message(RCM_TRACE1, "remove_from_polling_list(%ld)\n", pid);
1467 
1468 	/*
1469 	 * Need to stop the poll thread before manipulating the polllist
1470 	 * since poll thread may possibly be using polllist.fds[] and
1471 	 * polllist.n_pids. As an optimization, first check the pid
1472 	 * reference count. If the pid reference count is greater than 1
1473 	 * there is no need to stop the polling thread.
1474 	 */
1475 
1476 	index = find_pid_index(pid);
1477 	if (index == -1) {
1478 		rcm_log_message(RCM_NOTICE,
1479 		    gettext("error removing pid %ld from polling list\n"), pid);
1480 		return;
1481 	}
1482 
1483 	/*
1484 	 * decrement the pid refcnt
1485 	 */
1486 	if (polllist.refcnt[index] > 1) {
1487 		polllist.refcnt[index]--;
1488 		return;
1489 	}
1490 
1491 	stop_polling_thread();
1492 
1493 	/*
1494 	 * In an attempt to stop the poll thread we may have released
1495 	 * and reacquired rcm_req_lock. So find the index again.
1496 	 */
1497 	index = find_pid_index(pid);
1498 	if (index == -1) {
1499 		rcm_log_message(RCM_NOTICE,
1500 		    gettext("error removing pid %ld from polling list\n"), pid);
1501 		goto done;
1502 	}
1503 
1504 	if (--polllist.refcnt[index] > 0)
1505 		goto done;
1506 
1507 	/*
1508 	 * refcnt down to zero, delete pid from polling list
1509 	 */
1510 	(void) close(polllist.fds[index].fd);
1511 	polllist.n_pids--;
1512 
1513 	for (i = index; i < polllist.n_pids; i++) {
1514 		polllist.pids[i] = polllist.pids[i + 1];
1515 		polllist.refcnt[i] = polllist.refcnt[i + 1];
1516 		bcopy(&polllist.fds[i + 1], &polllist.fds[i],
1517 		    sizeof (struct pollfd));
1518 	}
1519 
1520 	rcm_log_message(RCM_DEBUG, "remove pid %ld at index %d\n", pid, index);
1521 
1522 done:
1523 	start_polling_thread();
1524 }
1525 
/*
 * Mark the poll thread as not running. (thread_t)-1 is the "no poll
 * thread" value tested by start_polling_thread() and
 * stop_polling_thread().
 */
void
init_poll_thread()
{
	polllist.poll_tid = (thread_t)-1;
}
1531 
1532 void
1533 cleanup_poll_thread()
1534 {
1535 	(void) mutex_lock(&rcm_req_lock);
1536 	if (polllist.poll_tid == thr_self()) {
1537 		rcm_log_message(RCM_TRACE2,
1538 		    "cleanup_poll_thread: n_pids = %d\n", polllist.n_pids);
1539 		polllist.poll_tid = (thread_t)-1;
1540 		(void) cond_broadcast(&polllist.cv);
1541 	}
1542 	(void) mutex_unlock(&rcm_req_lock);
1543 }
1544 
1545 /*ARGSUSED*/
1546 static void *
1547 pollfunc(void *arg)
1548 {
1549 	sigset_t mask;
1550 
1551 	rcm_log_message(RCM_TRACE2, "poll thread started. n_pids = %d\n",
1552 	    polllist.n_pids);
1553 
1554 	/*
1555 	 * Unblock SIGUSR1 to allow polling thread to be killed
1556 	 */
1557 	(void) sigemptyset(&mask);
1558 	(void) sigaddset(&mask, SIGUSR1);
1559 	(void) thr_sigsetmask(SIG_UNBLOCK, &mask, NULL);
1560 
1561 	(void) poll(polllist.fds, polllist.n_pids, (time_t)-1);
1562 
1563 	/*
1564 	 * block SIGUSR1 to avoid being killed while holding a lock
1565 	 */
1566 	(void) sigemptyset(&mask);
1567 	(void) sigaddset(&mask, SIGUSR1);
1568 	(void) thr_sigsetmask(SIG_BLOCK, &mask, NULL);
1569 
1570 	rcm_log_message(RCM_TRACE2, "returned from poll()\n");
1571 
1572 	cleanup_poll_thread();
1573 
1574 	(void) mutex_lock(&barrier.lock);
1575 	need_cleanup = 1;
1576 	(void) cond_broadcast(&barrier.cv);
1577 	(void) mutex_unlock(&barrier.lock);
1578 
1579 	return (NULL);
1580 }
1581 
1582 /*
1583  * rcm_req_lock must be held
1584  */
1585 void
1586 start_polling_thread()
1587 {
1588 	int err;
1589 
1590 	if (rcmd_get_state() != RCMD_NORMAL)
1591 		return;
1592 
1593 	if (polllist.poll_tid != (thread_t)-1 || polllist.n_pids == 0)
1594 		return;
1595 
1596 	if ((err = thr_create(NULL, 0, pollfunc, NULL, THR_DETACHED,
1597 	    &polllist.poll_tid)) == 0)
1598 		polllist.signaled = 0;
1599 	else
1600 		rcm_log_message(RCM_ERROR,
1601 		    gettext("failed to create polling thread: %s\n"),
1602 		    strerror(err));
1603 }
1604 
1605 /*
1606  * rcm_req_lock must be held
1607  */
1608 static void
1609 stop_polling_thread()
1610 {
1611 	int err;
1612 
1613 	while (polllist.poll_tid != (thread_t)-1) {
1614 		if (polllist.signaled == 0) {
1615 			if ((err = thr_kill(polllist.poll_tid, SIGUSR1)) == 0)
1616 				polllist.signaled = 1;
1617 			else
1618 				/*
1619 				 * thr_kill shouldn't have failed since the
1620 				 * poll thread id and the signal are valid.
1621 				 * So log an error. Since when thr_kill
1622 				 * fails no signal is sent (as per man page),
1623 				 * the cond_wait below will wait until the
1624 				 * the poll thread exits by some other means.
1625 				 * The poll thread, for example, exits on its
1626 				 * own when any DR initiator process that it
1627 				 * is currently polling exits.
1628 				 */
1629 				rcm_log_message(RCM_ERROR,
1630 				    gettext(
1631 				    "fail to kill polling thread %d: %s\n"),
1632 				    polllist.poll_tid, strerror(err));
1633 		}
1634 		(void) cond_wait(&polllist.cv, &rcm_req_lock);
1635 	}
1636 }
1637