xref: /titanic_52/usr/src/cmd/avs/nsctl/nskernd.c (revision d22e11eb92a44ef85ea64989dbff7134a35829cc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/resource.h>
28 #include <sys/priocntl.h>
29 #include <sys/rtpriocntl.h>
30 #include <sys/tspriocntl.h>
31 #include <sys/wait.h>
32 #include <sys/stat.h>
33 
34 #include <strings.h>
35 #include <thread.h>
36 #include <stdlib.h>
37 #include <signal.h>
38 #include <errno.h>
39 #include <stdio.h>
40 #include <fcntl.h>
41 #include <locale.h>
42 #include <unistd.h>
43 #include <syslog.h>
44 
45 #include <sys/nsctl/cfg.h>
46 #include <sys/nsctl/nsctl.h>
47 #include <sys/nsctl/nsc_ioctl.h>
48 #include <sys/nskernd.h>
49 #include <nsctl.h>
50 
51 #include <sys/mkdev.h>
52 #include <sys/nsctl/sv_efi.h>
53 
/* Path of the nsctl control device that the daemon opens and ioctl()s. */
static const char *rdev = "/dev/nsctl";

/*
 * Define a minimal user stack size in bytes over and above the
 * libthread THR_MIN_STACK minimum value.
 *
 * This stack size needs to be sufficient to run _newlwp() and then
 * ioctl() down into the kernel.  The same size is used for the
 * threads that run _dolock().
 */
#define	NSK_STACK_SIZE	512

/*
 * LWP scheduling control switches.
 *
 * allow_pri	- set to non-zero to enable priocntl() manipulations of
 *		created LWPs.
 * allow_rt	- set to non-zero to use the RT rather than the TS
 *		scheduling class when manipulating the scheduling
 *		parameters for an LWP.  Only used if allow_pri is
 *		non-zero.
 */
static int allow_pri = 1;
static int allow_rt = 0;	/* disallow - bad interactions with timeout() */
77 
/* File descriptor for /dev/nsctl; -1 until main() has opened the device. */
static int nsctl_fd = -1;

/*
 * Count of SIGTERM deliveries.  Incremented asynchronously by sighand()
 * and polled/reset by the service loop, so it must be an object that is
 * safe to access from a signal handler.
 */
static volatile sig_atomic_t sigterm;
80 
static int nthreads;		/* number of threads in the kernel */
static int exiting;		/* shutdown in progress flag */
/* thr_mutex is held around every access to nthreads and exiting */
static mutex_t thr_mutex = DEFAULTMUTEX;
/* cfg_mutex is held from cfg_open() through cfg_close() of the dscfg file */
static mutex_t cfg_mutex = DEFAULTMUTEX;

/* value returned by cfg_iscluster(); -1 until iscluster() has been called */
static int cl_nodeid = -1;

/* -g option: report the busy thread count when a shutdown is first refused */
static int display_msg = 0;
/* -d option: seconds canshutdown() waits for kernel threads to drain */
static int delay_time = 30;
90 
/*
 * Write the command synopsis to stderr and terminate the process with
 * exit status 255.  Does not return.
 */
static void
usage(void)
{
	(void) fprintf(stderr, gettext("usage: nskernd\n"));

	exit(255);
}
97 
98 
99 static void
100 sighand(int sig)
101 {
102 	if (sig == SIGTERM) {
103 		sigterm++;
104 	}
105 }
106 
107 
108 /*
109  * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel
110  */
111 int
112 nthread_inc(void)
113 {
114 	mutex_lock(&thr_mutex);
115 	if (exiting) {
116 		/* cannot enter kernel as nskernd is being shutdown - exit */
117 		mutex_unlock(&thr_mutex);
118 		return (0);
119 	}
120 	nthreads++;
121 	mutex_unlock(&thr_mutex);
122 	return (1);
123 }
124 
125 
126 void
127 nthread_dec(void)
128 {
129 	mutex_lock(&thr_mutex);
130 	nthreads--;
131 	mutex_unlock(&thr_mutex);
132 }
133 
134 
135 /*
136  * returns: 1 - can shutdown; 0 - unable to shutdown
137  */
138 int
139 canshutdown(void)
140 {
141 	int rc = 1;
142 	time_t	start_delay;
143 
144 	mutex_lock(&thr_mutex);
145 	if (nthreads > 0) {
146 		if (display_msg) {
147 			fprintf(stderr,
148 			    gettext("nskernd: unable to shutdown: "
149 			    "%d kernel threads in use\n"), nthreads);
150 		}
151 		start_delay = time(0);
152 		while (nthreads > 0 && (time(0) - start_delay) < delay_time) {
153 			mutex_unlock(&thr_mutex);
154 			sleep(1);
155 			mutex_lock(&thr_mutex);
156 			fprintf(stderr,
157 			    gettext("nskernd:   delay shutdown: "
158 			    "%d kernel threads in use\n"), nthreads);
159 		}
160 		if (nthreads > 0) {
161 			rc = 0;
162 		} else {
163 			exiting = 1;
164 		}
165 	} else {
166 		/* flag shutdown in progress */
167 		exiting = 1;
168 	}
169 	mutex_unlock(&thr_mutex);
170 
171 	return (rc);
172 }
173 
174 
175 /*
176  * returns: 1 - shutdown successful; 0 - unable to shutdown
177  */
178 int
179 shutdown(void)
180 {
181 	struct nskernd data;
182 	int rc;
183 
184 	if (nsctl_fd < 0)
185 		return (1);
186 
187 	bzero(&data, sizeof (data));
188 	data.command = NSKERND_STOP;
189 
190 	if (!canshutdown()) {
191 		return (0);
192 	}
193 
194 	rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
195 	if (rc < 0) {
196 		if (errno != EINTR || !sigterm) {
197 			fprintf(stderr,
198 			    gettext("nskernd: NSKERND_STOP failed\n"));
199 		}
200 	}
201 
202 	return (1);
203 }
204 
205 
206 /*
207  * First function run by a NSKERND_NEWLWP thread.
208  *
209  * Determines if it needs to change the scheduling priority of the LWP,
210  * and then calls back into the kernel.
211  */
212 static void *
213 _newlwp(void *arg)
214 {
215 	struct nskernd nsk;
216 	pcparms_t pcparms;
217 	pcinfo_t pcinfo;
218 
219 	/* copy arguments onto stack and free heap memory */
220 	bcopy(arg, &nsk, sizeof (nsk));
221 	free(arg);
222 
223 	if (nsk.data2 && allow_pri) {
224 		/* increase the scheduling priority of this LWP */
225 
226 		bzero(&pcinfo, sizeof (pcinfo));
227 		strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS");
228 
229 		if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) {
230 			fprintf(stderr,
231 				gettext(
232 				"nskernd: priocntl(PC_GETCID) failed: %s\n"),
233 				strerror(errno));
234 			goto pri_done;
235 		}
236 
237 		bzero(&pcparms, sizeof (pcparms));
238 		pcparms.pc_cid = pcinfo.pc_cid;
239 
240 		if (allow_rt) {
241 			((rtparms_t *)pcparms.pc_clparms)->rt_pri =
242 				(pri_t)0; /* minimum RT priority */
243 			((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs =
244 				(uint_t)RT_TQDEF;
245 			((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs =
246 				RT_TQDEF;
247 		} else {
248 			((tsparms_t *)pcparms.pc_clparms)->ts_uprilim =
249 				((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
250 			((tsparms_t *)pcparms.pc_clparms)->ts_upri =
251 				((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
252 		}
253 
254 		if (priocntl(P_LWPID, P_MYID,
255 		    PC_SETPARMS, (char *)&pcparms) < 0) {
256 			fprintf(stderr,
257 				gettext(
258 				"nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
259 				strerror(errno));
260 		}
261 	}
262 
263 pri_done:
264 	if (nthread_inc()) {
265 		(void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
266 		nthread_dec();
267 	}
268 	return (NULL);
269 }
270 
271 
272 /*
273  * Start a new thread bound to an LWP.
274  *
275  * This is the user level side of nsc_create_process().
276  */
277 static void
278 newlwp(struct nskernd *req)
279 {
280 	struct nskernd *nskp;
281 	thread_t tid;
282 	int rc;
283 
284 	nskp = malloc(sizeof (*nskp));
285 	if (!nskp) {
286 #ifdef DEBUG
287 		fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"),
288 			sizeof (*nskp));
289 #endif
290 		req->data1 = (uint64_t)ENOMEM;
291 		return;
292 	}
293 
294 	/* copy args for child */
295 	bcopy(req, nskp, sizeof (*nskp));
296 
297 	rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
298 		_newlwp, nskp, THR_BOUND|THR_DETACHED, &tid);
299 
300 	if (rc != 0) {
301 		/* thr_create failed */
302 #ifdef DEBUG
303 		fprintf(stderr, gettext("nskernd: thr_create failed: %s\n"),
304 			strerror(errno));
305 #endif
306 		req->data1 = (uint64_t)errno;
307 		free(nskp);
308 	} else {
309 		/* success - _newlwp() will free nskp */
310 		req->data1 = (uint64_t)0;
311 	}
312 }
313 
314 static int
315 log_iibmp_err(char *set, int flags)
316 {
317 	CFGFILE *cfg;
318 	char key[CFG_MAX_KEY];
319 	char buf[CFG_MAX_BUF];
320 	char newflags[CFG_MAX_BUF];
321 	char outbuf[CFG_MAX_BUF];
322 	char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp;
323 	int setno, found = 0;
324 	int setlen;
325 	int rc = 0;
326 	pid_t pid = -1;
327 
328 	if (set && *set) {
329 		setlen = strlen(set);
330 	} else {
331 		return (EINVAL);
332 	}
333 
334 	mutex_lock(&cfg_mutex);
335 	cfg = cfg_open("");
336 	if (!cfg) {
337 		mutex_unlock(&cfg_mutex);
338 		return (ENXIO);
339 	}
340 
341 	if (!cfg_lock(cfg, CFG_WRLOCK)) {
342 
343 		mutex_unlock(&cfg_mutex);
344 		cfg_close(cfg);
345 
346 		pid = fork();
347 
348 		if (pid == -1) {
349 			fprintf(stderr, gettext(
350 			    "nskernd: Error forking\n"));
351 			return (errno);
352 		} else if (pid > 0) {
353 			fprintf(stdout, gettext(
354 			    "nskernd: Attempting deferred bitmap error\n"));
355 			return (0);
356 		}
357 
358 		mutex_lock(&cfg_mutex);
359 		cfg = cfg_open("");
360 		if (!cfg) {
361 			mutex_unlock(&cfg_mutex);
362 			fprintf(stderr, gettext(
363 			    "nskernd: Failed cfg_open, deferred bitmap\n"));
364 			return (ENXIO);
365 		}
366 
367 		/* Sooner or later, this lock will be free */
368 		while (!cfg_lock(cfg, CFG_WRLOCK))
369 			sleep(2);
370 	}
371 
372 	/* find the proper set number */
373 	for (setno = 1; !found; setno++) {
374 		snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
375 		if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) {
376 			break;
377 		}
378 
379 		mst = strtok(buf, " ");
380 		shd = strtok(NULL, " ");
381 		if (strncmp(shd, set, setlen) == 0) {
382 			found = 1;
383 
384 			bmp = strtok(NULL, " ");
385 			mode = strtok(NULL, " ");
386 			ovr = strtok(NULL, " ");
387 			cnode = strtok(NULL, " ");
388 			opt = strtok(NULL, " ");
389 			grp = strtok(NULL, " ");
390 			break;
391 		}
392 	}
393 
394 	if (found) {
395 		/* were there flags in the options field already? */
396 		snprintf(newflags, CFG_MAX_BUF, "%s=0x%x",
397 		    NSKERN_II_BMP_OPTION, flags);
398 		if (opt && strcmp(opt, "-") != 0) {
399 			bzero(newflags, CFG_MAX_BUF);
400 			opt = strtok(opt, ";");
401 			while (opt) {
402 				if (strncmp(opt, NSKERN_II_BMP_OPTION,
403 				    strlen(NSKERN_II_BMP_OPTION)) != 0) {
404 					strcat(newflags, ";");
405 					strcat(newflags, opt);
406 				}
407 			}
408 		}
409 		snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
410 		snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s",
411 			mst, shd, bmp, mode, ovr, cnode, newflags, grp);
412 		if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) {
413 			printf("Failed to put [%s]\n", outbuf);
414 			rc = ENXIO;
415 		} else {
416 			cfg_commit(cfg);
417 			rc = 0;
418 		}
419 	} else {
420 		fprintf(stderr, gettext(
421 			    "nskernd: Failed deferred bitmap [%s]\n"), set);
422 		rc = EINVAL;
423 	}
424 	cfg_unlock(cfg);
425 	cfg_close(cfg);
426 	mutex_unlock(&cfg_mutex);
427 
428 	/*
429 	 * if we are the fork'ed client, just exit, if parent just return
430 	 */
431 	if (pid == 0) {
432 		exit(rc);
433 		/*NOTREACHED*/
434 	} else {
435 		return (rc);
436 	}
437 }
438 
439 /*
440  * First function run by a NSKERND_LOCK thread.
441  *
442  * Opens dscfg and locks it,
443  * and then calls back into the kernel.
444  *
445  * Incoming:
446  *	data1 is the kernel address of the sync structure.
447  *	data2 is read(0)/write(1) lock mode.
448  *
449  * Returns:
450  *	data1 as incoming.
451  *	data2 errno.
452  */
453 static void *
454 _dolock(void *arg)
455 {
456 	struct nskernd nsk;
457 	CFGFILE *cfg;
458 	int locked;
459 	int mode;
460 	int rc = 0;
461 
462 	/* copy arguments onto stack and free heap memory */
463 	bcopy(arg, &nsk, sizeof (nsk));
464 	free(arg);
465 
466 	mutex_lock(&cfg_mutex);
467 	cfg = cfg_open("");
468 	if (cfg == NULL) {
469 #ifdef DEBUG
470 		fprintf(stderr, gettext("nskernd: cfg_open failed: %s\n"),
471 		    strerror(errno));
472 #endif
473 		rc = ENXIO;
474 	}
475 
476 	if (nsk.data2 == 0) {
477 		mode = CFG_RDLOCK;
478 	} else {
479 		mode = CFG_WRLOCK;
480 	}
481 
482 	locked = 0;
483 	if (rc == 0) {
484 		if (cfg_lock(cfg, mode)) {
485 			locked = 1;
486 		} else {
487 #ifdef DEBUG
488 			fprintf(stderr,
489 			    gettext("nskernd: cfg_lock failed: %s\n"),
490 			    strerror(errno));
491 #endif
492 			rc = EINVAL;
493 		}
494 	}
495 
496 	/* return to kernel */
497 
498 	nsk.data2 = (uint64_t)rc;
499 	if (nthread_inc()) {
500 		(void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
501 		nthread_dec();
502 	}
503 
504 	/* cleanup */
505 
506 	if (locked) {
507 		cfg_unlock(cfg);
508 		locked = 0;
509 	}
510 
511 	if (cfg != NULL) {
512 		cfg_close(cfg);
513 		cfg = NULL;
514 	}
515 	mutex_unlock(&cfg_mutex);
516 
517 	return (NULL);
518 }
519 
520 
521 /*
522  * Inter-node lock thread.
523  *
524  * This is the user level side of nsc_rmlock().
525  */
526 static void
527 dolock(struct nskernd *req)
528 {
529 	struct nskernd *nskp;
530 	thread_t tid;
531 	int rc;
532 
533 	/* create a new thread to do the lock and return to kernel */
534 
535 	nskp = malloc(sizeof (*nskp));
536 	if (!nskp) {
537 #ifdef DEBUG
538 		fprintf(stderr, gettext("nskernd:dolock: malloc(%d) failed\n"),
539 		    sizeof (*nskp));
540 #endif
541 		req->data1 = (uint64_t)ENOMEM;
542 		return;
543 	}
544 
545 	/* copy args for child */
546 	bcopy(req, nskp, sizeof (*nskp));
547 
548 	rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
549 	    _dolock, nskp, THR_BOUND|THR_DETACHED, &tid);
550 
551 	if (rc != 0) {
552 		/* thr_create failed */
553 #ifdef DEBUG
554 		fprintf(stderr, gettext("nskernd: thr_create failed: %s\n"),
555 		    strerror(errno));
556 #endif
557 		req->data1 = (uint64_t)errno;
558 		free(nskp);
559 	} else {
560 		/* success - _dolock() will free nskp */
561 		req->data1 = (uint64_t)0;
562 	}
563 }
564 
565 
566 /*
567  * Convenience code for engineering test of multi-terabyte volumes.
568  *
569  * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
570  * labels.  This code allocates a simple efi label structure and ioctls
571  * to extract the size of a zvol.  It only handles the minimal EFI ioctl
572  * implementation in zvol.
573  */
574 
575 static void
576 zvol_bsize(char *path, uint64_t *size, const int pnum)
577 {
578 	struct stat64 stb1, stb2;
579 	struct dk_minfo dkm;
580 	int fd = -1;
581 	int rc;
582 
583 	if (cl_nodeid || pnum != 0)
584 		return;
585 
586 	if ((fd = open(path, O_RDONLY)) < 0) {
587 		return;
588 	}
589 
590 	if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 ||
591 	    fstat64(fd, &stb2) != 0 ||
592 	    !S_ISCHR(stb1.st_mode) ||
593 	    !S_ISCHR(stb2.st_mode) ||
594 	    major(stb1.st_rdev) != major(stb2.st_rdev)) {
595 		(void) close(fd);
596 		return;
597 	}
598 
599 	rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm);
600 	if (rc >= 0) {
601 		*size = LE_64(dkm.dki_capacity) *
602 			(dkm.dki_lbsize) / 512;
603 	}
604 
605 	(void) close(fd);
606 }
607 
608 /* ARGSUSED */
609 static void
610 get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path)
611 {
612 	struct nscioc_bsize bsize;
613 #ifdef DKIOCPARTITION
614 	struct partition64 p64;
615 #endif
616 	struct dk_cinfo dki_info;
617 	struct vtoc vtoc;
618 	int fd;
619 
620 	*partitionp = -1;
621 	*size = (uint64_t)0;
622 
623 	dki_info.dki_partition = (ushort_t)-1;
624 	bsize.dki_info = (uint64_t)(unsigned long)&dki_info;
625 	bsize.vtoc = (uint64_t)(unsigned long)&vtoc;
626 	bsize.raw_fd = raw_fd;
627 	bsize.efi = 0;
628 
629 	fd = open(rdev, O_RDONLY);
630 	if (fd < 0)
631 		return;
632 
633 	if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
634 		if (dki_info.dki_partition != (ushort_t)-1) {
635 			/* assume part# is ok and just the size failed */
636 			*partitionp = (int)dki_info.dki_partition;
637 
638 #ifdef DKIOCPARTITION
639 			/* see if this is an EFI label */
640 			bzero(&p64, sizeof (p64));
641 			p64.p_partno = (uint_t)*partitionp;
642 			if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) {
643 				*size = (uint64_t)p64.p_size;
644 			} else {
645 				bsize.p64 = (uint64_t)(unsigned long)&p64;
646 				bsize.efi = 1;
647 
648 				if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
649 					/* see if this is a zvol */
650 					zvol_bsize(path, size, *partitionp);
651 				} else {
652 					*size = (uint64_t)p64.p_size;
653 				}
654 			}
655 #endif	/* DKIOCPARTITION */
656 		}
657 
658 		close(fd);
659 		return;
660 	}
661 
662 	close(fd);
663 
664 	*partitionp = (int)dki_info.dki_partition;
665 
666 	if (vtoc.v_sanity != VTOC_SANE)
667 		return;
668 
669 	if (vtoc.v_version != V_VERSION && vtoc.v_version != 0)
670 		return;
671 
672 	if (dki_info.dki_partition > V_NUMPAR)
673 		return;
674 
675 	*size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size;
676 }
677 
678 
679 static int
680 iscluster(void)
681 {
682 	/*
683 	 * Find out if we are running in a cluster
684 	 */
685 	cl_nodeid = cfg_iscluster();
686 	if (cl_nodeid > 0) {
687 		return (TRUE);
688 	} else if (cl_nodeid == 0) {
689 		return (FALSE);
690 	}
691 
692 	fprintf(stderr, "%s\n",
693 	    gettext("nskernd: unable to ascertain environment"));
694 	exit(1);
695 	/* NOTREACHED */
696 }
697 
/*
 * Runtime Solaris release checking - build release == runtime release
 * is always considered success, so only keep entries in the map for
 * the special cases.
 *
 * The table is handed to nsc_check_release() by main() and is terminated
 * by an all-NULL entry.
 * NOTE(review): which column is the build release and which the runtime
 * release is defined by nsc_release_t - confirm against its declaration.
 */
static nsc_release_t nskernd_rel_map[] = {
/*	{ "5.10", "5.10" },			*/
	{ "5.11", "5.10" },
	{ NULL, NULL }
};
708 
709 
710 #ifdef lint
711 #define	main	nskernd_main
712 #endif
713 /* ARGSUSED1 */
714 int
715 main(int argc, char *argv[])
716 {
717 	const char *dir = "/";
718 	struct nskernd data;
719 	struct rlimit rl;
720 	int i, run, rc;
721 	int partition;
722 	char *reqd;
723 	int syncpipe[2];
724 	int startup;
725 
726 	(void) setlocale(LC_ALL, "");
727 	(void) textdomain("nskernd");
728 
729 	rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd);
730 	if (rc < 0) {
731 		fprintf(stderr,
732 		    gettext("nskernd: unable to determine the current "
733 		    "Solaris release: %s\n"), strerror(errno));
734 		exit(1);
735 	} else if (rc == FALSE) {
736 		fprintf(stderr,
737 		    gettext("nskernd: incorrect Solaris release "
738 		    "(requires %s)\n"), reqd);
739 		exit(1);
740 	}
741 
742 	rc = 0;
743 
744 	if (argc != 1)
745 		usage();
746 
747 	/*
748 	 * Usage: <progname> [-g] [-d <seconds to delay>]
749 	 */
750 	while ((i = getopt(argc, argv, "gd:")) != EOF) {
751 		switch (i) {
752 			case 'g':
753 				display_msg = 1;
754 				break;
755 			case 'd':
756 				delay_time = atoi(optarg);
757 				if (delay_time <= 0) {
758 					delay_time = 30;
759 				}
760 				break;
761 			default:
762 				syslog(LOG_ERR,
763 				"Usage: nskernd [-g] [-d <seconds to delay>]");
764 				exit(1);
765 				break;
766 		}
767 	}
768 
769 	if (chroot(dir) < 0) {
770 		fprintf(stderr, gettext("nskernd: chroot failed: %s\n"),
771 			strerror(errno));
772 		exit(1);
773 	}
774 
775 	if (chdir(dir) < 0) {
776 		fprintf(stderr, gettext("nskernd: chdir failed: %s\n"),
777 			strerror(errno));
778 		exit(1);
779 	}
780 
781 	/*
782 	 * Determine if we are in a Sun Cluster or not, before fork'ing
783 	 */
784 	(void) iscluster();
785 
786 	/*
787 	 * create a pipe to synchronise the parent with the
788 	 * child just before it enters its service loop.
789 	 */
790 	if (pipe(syncpipe) < 0) {
791 		fprintf(stderr, gettext("nskernd: cannot create pipe: %s\n"),
792 		    strerror(errno));
793 		exit(1);
794 	}
795 	/*
796 	 * Fork off a child that becomes the daemon.
797 	 */
798 
799 	if ((rc = fork()) > 0) {
800 		char c;
801 		int n;
802 		(void) close(syncpipe[1]);
803 		/*
804 		 * wait for the close of the pipe.
805 		 * If we get a char back, indicates good
806 		 * status from child, so exit 0.
807 		 * If we get a zero length read, then the
808 		 * child has failed, so we do too.
809 		 */
810 		n = read(syncpipe[0], &c, 1);
811 		exit((n <= 0) ? 1 : 0);
812 	} else if (rc < 0) {
813 		fprintf(stderr, gettext("nskernd: cannot fork: %s\n"),
814 			strerror(errno));
815 		exit(1);
816 	}
817 
818 	/*
819 	 * In child - become daemon.
820 	 */
821 
822 	/* use closefrom(3C) from PSARC/2000/193 when possible */
823 	for (i = 0; i < syncpipe[1]; i++) {
824 		(void) close(i);
825 	}
826 	closefrom(syncpipe[1] + 1);
827 
828 	(void) open("/dev/console", O_WRONLY|O_APPEND);
829 	(void) dup(0);
830 	(void) dup(0);
831 	(void) close(0);
832 
833 	setpgrp();
834 
835 	/*
836 	 * Ignore all signals apart from SIGTERM.
837 	 */
838 
839 	for (i = 1; i < _sys_nsig; i++)
840 		(void) sigset(i, SIG_IGN);
841 
842 	(void) sigset(SIGTERM, sighand);
843 
844 	/*
845 	 * Increase the number of fd's that can be open.
846 	 */
847 
848 	rl.rlim_cur = RLIM_INFINITY;
849 	rl.rlim_max = RLIM_INFINITY;
850 	if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
851 		fprintf(stderr,
852 		    gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
853 		    strerror(errno));
854 		fprintf(stderr,
855 		    gettext("nskernd: the maximum number of nsctl open "
856 		    "devices may be reduced\n"));
857 	}
858 
859 	/*
860 	 * Open /dev/nsctl and startup.
861 	 */
862 
863 	nsctl_fd = open(rdev, O_RDONLY);
864 	if (nsctl_fd < 0) {
865 		fprintf(stderr, gettext("nskernd: unable to open %s\n"), rdev);
866 		exit(1);
867 	}
868 
869 	bzero(&data, sizeof (data));
870 
871 	data.command = NSKERND_START;
872 	data.data1 = (uint64_t)cl_nodeid;
873 	run = 1;
874 
875 	startup = 1;
876 	while (run) {
877 		rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
878 		if (rc < 0) {
879 			/* try and do kernel cleanup and exit */
880 			if (shutdown()) {
881 				run = 0;
882 			} else {
883 				sigterm = 0;
884 			}
885 
886 			fprintf(stderr,
887 			    gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
888 			    strerror(errno));
889 			continue;
890 		} else if (sigterm) {
891 			/* SIGTERM received - terminate */
892 			if (data.command != NSKERND_START &&
893 			    (data.command != NSKERND_STOP ||
894 			    data.data1 != (uint64_t)1)) {
895 				/* need to do kernel cleanup */
896 				if (shutdown()) {
897 					run = 0;
898 				} else {
899 					sigterm = 0;
900 					data.command = NSKERND_START;
901 					data.data1 = (uint64_t)cl_nodeid;
902 				}
903 			} else {
904 				/* just quit */
905 				if (canshutdown()) {
906 					run = 0;
907 				} else {
908 					/* cannot shutdown - threads active */
909 					sigterm = 0;
910 					data.command = NSKERND_START;
911 					data.data1 = (uint64_t)cl_nodeid;
912 				}
913 			}
914 			continue;
915 		}
916 		if (startup) {
917 			char c = 0;
918 			(void) write(syncpipe[1], &c, 1);
919 			(void) close(syncpipe[1]);
920 			startup = 0;
921 		}
922 		switch (data.command) {
923 		case NSKERND_START:	/* (re)start completion */
924 			if (rc == 1) {
925 				fprintf(stderr,
926 				    gettext("nskernd: already started\n"));
927 				run = 0;
928 			} else if (rc == 2) {
929 				fprintf(stderr,
930 				    gettext("nskernd: stopped by kernel\n"));
931 				run = 0;
932 			}
933 			data.command = NSKERND_WAIT;
934 			break;
935 
936 		case NSKERND_STOP:	/* kernel telling daemon to stop */
937 			if (data.data1 != (uint64_t)1) {
938 				(void) shutdown();
939 				run = 0;
940 			}
941 			break;
942 
943 		case NSKERND_BSIZE:
944 			/*
945 			 * kernel requesting partsize
946 			 * data1 - size return
947 			 * data2 - raw_fd (entry)
948 			 *	 - partition number (return)
949 			 */
950 			partition = -1;
951 			get_bsize(data.data2, &data.data1,
952 			    &partition, data.char1);
953 			data.data2 = (uint64_t)partition;
954 			data.command = NSKERND_WAIT;
955 			break;
956 
957 		case NSKERND_NEWLWP:	/* kernel requesting a new LWP */
958 			newlwp(&data);
959 			data.command = NSKERND_WAIT;
960 			break;
961 
962 		case NSKERND_LOCK:  	/* kernel requesting lock */
963 			dolock(&data);
964 			data.command = NSKERND_WAIT;
965 			break;
966 
967 		case NSKERND_WAIT:	/* kernel retrying wait */
968 			/*
969 			 * the kernel thread can be woken by the dr config
970 			 * utilities (ie cfgadm) therefore we just reissue
971 			 * the wait.
972 			 */
973 			break;
974 
975 		case NSKERND_IIBITMAP:
976 			rc = log_iibmp_err(data.char1, (int)data.data1);
977 			data.data1 = (uint64_t)rc;
978 			data.command = NSKERND_WAIT;
979 			break;
980 
981 		default:
982 			fprintf(stderr,
983 				gettext("nskernd: unknown command %d"),
984 				data.command);
985 			data.command = NSKERND_WAIT;
986 			break;
987 		}
988 	}
989 
990 	(void) close(nsctl_fd);
991 
992 	return (rc);
993 }
994