xref: /titanic_50/usr/src/cmd/avs/nsctl/nskernd.c (revision 0f5cc0e1bee31c69c160a9cf7ffdff5fac4f8e6d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/resource.h>
29 #include <sys/priocntl.h>
30 #include <sys/rtpriocntl.h>
31 #include <sys/tspriocntl.h>
32 #include <sys/wait.h>
33 #include <sys/stat.h>
34 
35 #include <strings.h>
36 #include <thread.h>
37 #include <stdlib.h>
38 #include <signal.h>
39 #include <errno.h>
40 #include <stdio.h>
41 #include <fcntl.h>
42 #include <locale.h>
43 #include <unistd.h>
44 #include <syslog.h>
45 
46 #include <sys/nsctl/cfg.h>
47 #include <sys/nsctl/nsctl.h>
48 #include <sys/nsctl/nsc_ioctl.h>
49 #include <sys/nskernd.h>
50 #include <nsctl.h>
51 
52 #include <sys/mkdev.h>
53 #include <sys/nsctl/sv_efi.h>
54 
55 static const char *rdev = "/dev/nsctl";
56 
57 /*
58  * Define a minimal user stack size in bytes over and above the
59  * libthread THR_STACK_MIN minimum value.
60  *
61  * This stack size needs to be sufficient to run _newlwp() and then
62  * ioctl() down into the kernel.
63  */
64 #define	NSK_STACK_SIZE	512
65 
66 /*
67  * LWP scheduling control switches.
68  *
69  * allow_pri	- set to non-zero to enable priocntl() manipulations of
70  *		created LWPs.
71  * allow_rt	- set to non-zero to use the RT rather than the TS
72  *		scheduling class when manipulating the schduling
73  *		parameters for an LWP.  Only used if allow_pri is
74  *		non-zero.
75  */
76 static int allow_pri = 1;
77 static int allow_rt = 0;	/* disallow - bad interactions with timeout() */
78 
79 static int nsctl_fd = -1;
80 static int sigterm;
81 
82 static int nthreads;		/* number of threads in the kernel */
83 static int exiting;		/* shutdown in progress flag */
84 static mutex_t thr_mutex = DEFAULTMUTEX;
85 static mutex_t cfg_mutex = DEFAULTMUTEX;
86 
87 static int cl_nodeid = -1;
88 
89 static int display_msg = 0;
90 static int delay_time = 30;
91 
/*
 * Write the command synopsis to stderr and terminate with status 255.
 */
static void
usage(void)
{
	const char *synopsis = gettext("usage: nskernd\n");

	(void) fprintf(stderr, synopsis);
	exit(255);
}
98 
99 
100 static void
101 sighand(int sig)
102 {
103 	if (sig == SIGTERM) {
104 		sigterm++;
105 	}
106 }
107 
108 
109 /*
110  * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel
111  */
112 int
113 nthread_inc(void)
114 {
115 	(void) mutex_lock(&thr_mutex);
116 	if (exiting) {
117 		/* cannot enter kernel as nskernd is being shutdown - exit */
118 		(void) mutex_unlock(&thr_mutex);
119 		return (0);
120 	}
121 	nthreads++;
122 	(void) mutex_unlock(&thr_mutex);
123 	return (1);
124 }
125 
126 
127 void
128 nthread_dec(void)
129 {
130 	(void) mutex_lock(&thr_mutex);
131 	nthreads--;
132 	(void) mutex_unlock(&thr_mutex);
133 }
134 
135 
136 /*
137  * returns: 1 - can shutdown; 0 - unable to shutdown
138  */
139 int
140 canshutdown(void)
141 {
142 	int rc = 1;
143 	time_t	start_delay;
144 
145 	(void) mutex_lock(&thr_mutex);
146 	if (nthreads > 0) {
147 		if (display_msg) {
148 			(void) fprintf(stderr,
149 			    gettext("nskernd: unable to shutdown: "
150 			    "%d kernel threads in use\n"), nthreads);
151 		}
152 		start_delay = time(0);
153 		while (nthreads > 0 && (time(0) - start_delay) < delay_time) {
154 			(void) mutex_unlock(&thr_mutex);
155 			(void) sleep(1);
156 			(void) mutex_lock(&thr_mutex);
157 			(void) fprintf(stderr,
158 			    gettext("nskernd:   delay shutdown: "
159 			    "%d kernel threads in use\n"), nthreads);
160 		}
161 		if (nthreads > 0) {
162 			rc = 0;
163 		} else {
164 			exiting = 1;
165 		}
166 	} else {
167 		/* flag shutdown in progress */
168 		exiting = 1;
169 	}
170 	(void) mutex_unlock(&thr_mutex);
171 
172 	return (rc);
173 }
174 
175 
176 /*
177  * returns: 1 - shutdown successful; 0 - unable to shutdown
178  */
179 int
180 shutdown(void)
181 {
182 	struct nskernd data;
183 	int rc;
184 
185 	if (nsctl_fd < 0)
186 		return (1);
187 
188 	bzero(&data, sizeof (data));
189 	data.command = NSKERND_STOP;
190 
191 	if (!canshutdown()) {
192 		return (0);
193 	}
194 
195 	rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
196 	if (rc < 0) {
197 		if (errno != EINTR || !sigterm) {
198 			(void) fprintf(stderr,
199 			    gettext("nskernd: NSKERND_STOP failed\n"));
200 		}
201 	}
202 
203 	return (1);
204 }
205 
206 
207 /*
208  * First function run by a NSKERND_NEWLWP thread.
209  *
210  * Determines if it needs to change the scheduling priority of the LWP,
211  * and then calls back into the kernel.
212  */
213 static void *
214 _newlwp(void *arg)
215 {
216 	struct nskernd nsk;
217 	pcparms_t pcparms;
218 	pcinfo_t pcinfo;
219 
220 	/* copy arguments onto stack and free heap memory */
221 	bcopy(arg, &nsk, sizeof (nsk));
222 	free(arg);
223 
224 	if (nsk.data2 && allow_pri) {
225 		/* increase the scheduling priority of this LWP */
226 
227 		bzero(&pcinfo, sizeof (pcinfo));
228 		(void) strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS");
229 
230 		if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) {
231 			(void) fprintf(stderr,
232 			    gettext(
233 			    "nskernd: priocntl(PC_GETCID) failed: %s\n"),
234 			    strerror(errno));
235 			goto pri_done;
236 		}
237 
238 		bzero(&pcparms, sizeof (pcparms));
239 		pcparms.pc_cid = pcinfo.pc_cid;
240 
241 		if (allow_rt) {
242 			((rtparms_t *)pcparms.pc_clparms)->rt_pri =
243 				(pri_t)0; /* minimum RT priority */
244 			((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs =
245 				(uint_t)RT_TQDEF;
246 			((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs =
247 				RT_TQDEF;
248 		} else {
249 			((tsparms_t *)pcparms.pc_clparms)->ts_uprilim =
250 				((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
251 			((tsparms_t *)pcparms.pc_clparms)->ts_upri =
252 				((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
253 		}
254 
255 		if (priocntl(P_LWPID, P_MYID,
256 		    PC_SETPARMS, (char *)&pcparms) < 0) {
257 			(void) fprintf(stderr,
258 			    gettext(
259 			    "nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
260 			    strerror(errno));
261 		}
262 	}
263 
264 pri_done:
265 	if (nthread_inc()) {
266 		(void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
267 		nthread_dec();
268 	}
269 	return (NULL);
270 }
271 
272 
273 /*
274  * Start a new thread bound to an LWP.
275  *
276  * This is the user level side of nsc_create_process().
277  */
278 static void
279 newlwp(struct nskernd *req)
280 {
281 	struct nskernd *nskp;
282 	thread_t tid;
283 	int rc;
284 
285 	nskp = malloc(sizeof (*nskp));
286 	if (!nskp) {
287 #ifdef DEBUG
288 		(void) fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"),
289 		    sizeof (*nskp));
290 #endif
291 		req->data1 = (uint64_t)ENOMEM;
292 		return;
293 	}
294 
295 	/* copy args for child */
296 	bcopy(req, nskp, sizeof (*nskp));
297 
298 	rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
299 		_newlwp, nskp, THR_BOUND|THR_DETACHED, &tid);
300 
301 	if (rc != 0) {
302 		/* thr_create failed */
303 #ifdef DEBUG
304 		(void) fprintf(stderr,
305 		    gettext("nskernd: thr_create failed: %s\n"),
306 		    strerror(errno));
307 #endif
308 		req->data1 = (uint64_t)errno;
309 		free(nskp);
310 	} else {
311 		/* success - _newlwp() will free nskp */
312 		req->data1 = (uint64_t)0;
313 	}
314 }
315 
316 static int
317 log_iibmp_err(char *set, int flags)
318 {
319 	CFGFILE *cfg;
320 	char key[CFG_MAX_KEY];
321 	char buf[CFG_MAX_BUF];
322 	char newflags[CFG_MAX_BUF];
323 	char outbuf[CFG_MAX_BUF];
324 	char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp;
325 	int setno, found = 0;
326 	int setlen;
327 	int rc = 0;
328 	pid_t pid = -1;
329 
330 	if (set && *set) {
331 		setlen = strlen(set);
332 	} else {
333 		return (EINVAL);
334 	}
335 
336 	(void) mutex_lock(&cfg_mutex);
337 	cfg = cfg_open("");
338 	if (!cfg) {
339 		(void) mutex_unlock(&cfg_mutex);
340 		return (ENXIO);
341 	}
342 
343 	if (!cfg_lock(cfg, CFG_WRLOCK)) {
344 
345 		(void) mutex_unlock(&cfg_mutex);
346 		cfg_close(cfg);
347 
348 		pid = fork();
349 
350 		if (pid == -1) {
351 			(void) fprintf(stderr, gettext(
352 			    "nskernd: Error forking\n"));
353 			return (errno);
354 		} else if (pid > 0) {
355 			(void) fprintf(stdout, gettext(
356 			    "nskernd: Attempting deferred bitmap error\n"));
357 			return (0);
358 		}
359 
360 		(void) mutex_lock(&cfg_mutex);
361 		cfg = cfg_open("");
362 		if (!cfg) {
363 			(void) mutex_unlock(&cfg_mutex);
364 			(void) fprintf(stderr, gettext(
365 			    "nskernd: Failed cfg_open, deferred bitmap\n"));
366 			return (ENXIO);
367 		}
368 
369 		/* Sooner or later, this lock will be free */
370 		while (!cfg_lock(cfg, CFG_WRLOCK))
371 			(void) sleep(2);
372 	}
373 
374 	/* find the proper set number */
375 	for (setno = 1; !found; setno++) {
376 		(void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
377 		if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) {
378 			break;
379 		}
380 
381 		mst = strtok(buf, " ");
382 		shd = strtok(NULL, " ");
383 		if (strncmp(shd, set, setlen) == 0) {
384 			found = 1;
385 
386 			bmp = strtok(NULL, " ");
387 			mode = strtok(NULL, " ");
388 			ovr = strtok(NULL, " ");
389 			cnode = strtok(NULL, " ");
390 			opt = strtok(NULL, " ");
391 			grp = strtok(NULL, " ");
392 			break;
393 		}
394 	}
395 
396 	if (found) {
397 		/* were there flags in the options field already? */
398 		(void) snprintf(newflags, CFG_MAX_BUF, "%s=0x%x",
399 		    NSKERN_II_BMP_OPTION, flags);
400 		if (opt && strcmp(opt, "-") != 0) {
401 			bzero(newflags, CFG_MAX_BUF);
402 			opt = strtok(opt, ";");
403 			while (opt) {
404 				if (strncmp(opt, NSKERN_II_BMP_OPTION,
405 				    strlen(NSKERN_II_BMP_OPTION)) != 0) {
406 					(void) strcat(newflags, ";");
407 					(void) strcat(newflags, opt);
408 				}
409 			}
410 		}
411 		(void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
412 		(void) snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s",
413 		    mst, shd, bmp, mode, ovr, cnode, newflags, grp);
414 		if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) {
415 			(void) printf("Failed to put [%s]\n", outbuf);
416 			rc = ENXIO;
417 		} else {
418 			(void) cfg_commit(cfg);
419 			rc = 0;
420 		}
421 	} else {
422 		(void) fprintf(stderr, gettext(
423 		    "nskernd: Failed deferred bitmap [%s]\n"), set);
424 		rc = EINVAL;
425 	}
426 	cfg_unlock(cfg);
427 	cfg_close(cfg);
428 	(void) mutex_unlock(&cfg_mutex);
429 
430 	/*
431 	 * if we are the fork'ed client, just exit, if parent just return
432 	 */
433 	if (pid == 0) {
434 		exit(rc);
435 		/*NOTREACHED*/
436 	} else {
437 		return (rc);
438 	}
439 }
440 
441 /*
442  * First function run by a NSKERND_LOCK thread.
443  *
444  * Opens dscfg and locks it,
445  * and then calls back into the kernel.
446  *
447  * Incoming:
448  *	data1 is the kernel address of the sync structure.
449  *	data2 is read(0)/write(1) lock mode.
450  *
451  * Returns:
452  *	data1 as incoming.
453  *	data2 errno.
454  */
455 static void *
456 _dolock(void *arg)
457 {
458 	struct nskernd nsk;
459 	CFGFILE *cfg;
460 	int locked;
461 	int mode;
462 	int rc = 0;
463 
464 	/* copy arguments onto stack and free heap memory */
465 	bcopy(arg, &nsk, sizeof (nsk));
466 	free(arg);
467 
468 	(void) mutex_lock(&cfg_mutex);
469 	cfg = cfg_open("");
470 	if (cfg == NULL) {
471 #ifdef DEBUG
472 		(void) fprintf(stderr,
473 		    gettext("nskernd: cfg_open failed: %s\n"),
474 		    strerror(errno));
475 #endif
476 		rc = ENXIO;
477 	}
478 
479 	if (nsk.data2 == 0) {
480 		mode = CFG_RDLOCK;
481 	} else {
482 		mode = CFG_WRLOCK;
483 	}
484 
485 	locked = 0;
486 	if (rc == 0) {
487 		if (cfg_lock(cfg, mode)) {
488 			locked = 1;
489 		} else {
490 #ifdef DEBUG
491 			(void) fprintf(stderr,
492 			    gettext("nskernd: cfg_lock failed: %s\n"),
493 			    strerror(errno));
494 #endif
495 			rc = EINVAL;
496 		}
497 	}
498 
499 	/* return to kernel */
500 
501 	nsk.data2 = (uint64_t)rc;
502 	if (nthread_inc()) {
503 		(void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
504 		nthread_dec();
505 	}
506 
507 	/* cleanup */
508 
509 	if (locked) {
510 		cfg_unlock(cfg);
511 		locked = 0;
512 	}
513 
514 	if (cfg != NULL) {
515 		cfg_close(cfg);
516 		cfg = NULL;
517 	}
518 	(void) mutex_unlock(&cfg_mutex);
519 
520 	return (NULL);
521 }
522 
523 
524 /*
525  * Inter-node lock thread.
526  *
527  * This is the user level side of nsc_rmlock().
528  */
529 static void
530 dolock(struct nskernd *req)
531 {
532 	struct nskernd *nskp;
533 	thread_t tid;
534 	int rc;
535 
536 	/* create a new thread to do the lock and return to kernel */
537 
538 	nskp = malloc(sizeof (*nskp));
539 	if (!nskp) {
540 #ifdef DEBUG
541 		(void) fprintf(stderr,
542 		    gettext("nskernd:dolock: malloc(%d) failed\n"),
543 		    sizeof (*nskp));
544 #endif
545 		req->data1 = (uint64_t)ENOMEM;
546 		return;
547 	}
548 
549 	/* copy args for child */
550 	bcopy(req, nskp, sizeof (*nskp));
551 
552 	rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
553 	    _dolock, nskp, THR_BOUND|THR_DETACHED, &tid);
554 
555 	if (rc != 0) {
556 		/* thr_create failed */
557 #ifdef DEBUG
558 		(void) fprintf(stderr,
559 		    gettext("nskernd: thr_create failed: %s\n"),
560 		    strerror(errno));
561 #endif
562 		req->data1 = (uint64_t)errno;
563 		free(nskp);
564 	} else {
565 		/* success - _dolock() will free nskp */
566 		req->data1 = (uint64_t)0;
567 	}
568 }
569 
570 
571 /*
572  * Convenience code for engineering test of multi-terabyte volumes.
573  *
574  * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
575  * labels.  This code allocates a simple efi label structure and ioctls
576  * to extract the size of a zvol.  It only handles the minimal EFI ioctl
577  * implementation in zvol.
578  */
579 
580 static void
581 zvol_bsize(char *path, uint64_t *size, const int pnum)
582 {
583 	struct stat64 stb1, stb2;
584 	struct dk_minfo dkm;
585 	int fd = -1;
586 	int rc;
587 
588 	if (cl_nodeid || pnum != 0)
589 		return;
590 
591 	if ((fd = open(path, O_RDONLY)) < 0) {
592 		return;
593 	}
594 
595 	if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 ||
596 	    fstat64(fd, &stb2) != 0 ||
597 	    !S_ISCHR(stb1.st_mode) ||
598 	    !S_ISCHR(stb2.st_mode) ||
599 	    major(stb1.st_rdev) != major(stb2.st_rdev)) {
600 		(void) close(fd);
601 		return;
602 	}
603 
604 	rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm);
605 	if (rc >= 0) {
606 		*size = LE_64(dkm.dki_capacity) *
607 			(dkm.dki_lbsize) / 512;
608 	}
609 
610 	(void) close(fd);
611 }
612 
613 /* ARGSUSED */
614 static void
615 get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path)
616 {
617 	struct nscioc_bsize bsize;
618 #ifdef DKIOCPARTITION
619 	struct partition64 p64;
620 #endif
621 	struct dk_cinfo dki_info;
622 	struct vtoc vtoc;
623 	int fd;
624 
625 	*partitionp = -1;
626 	*size = (uint64_t)0;
627 
628 	dki_info.dki_partition = (ushort_t)-1;
629 	bsize.dki_info = (uint64_t)(unsigned long)&dki_info;
630 	bsize.vtoc = (uint64_t)(unsigned long)&vtoc;
631 	bsize.raw_fd = raw_fd;
632 	bsize.efi = 0;
633 
634 	fd = open(rdev, O_RDONLY);
635 	if (fd < 0)
636 		return;
637 
638 	if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
639 		if (dki_info.dki_partition != (ushort_t)-1) {
640 			/* assume part# is ok and just the size failed */
641 			*partitionp = (int)dki_info.dki_partition;
642 
643 #ifdef DKIOCPARTITION
644 			/* see if this is an EFI label */
645 			bzero(&p64, sizeof (p64));
646 			p64.p_partno = (uint_t)*partitionp;
647 			if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) {
648 				*size = (uint64_t)p64.p_size;
649 			} else {
650 				bsize.p64 = (uint64_t)(unsigned long)&p64;
651 				bsize.efi = 1;
652 
653 				if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
654 					/* see if this is a zvol */
655 					zvol_bsize(path, size, *partitionp);
656 				} else {
657 					*size = (uint64_t)p64.p_size;
658 				}
659 			}
660 #endif	/* DKIOCPARTITION */
661 		}
662 
663 		(void) close(fd);
664 		return;
665 	}
666 
667 	(void) close(fd);
668 
669 	*partitionp = (int)dki_info.dki_partition;
670 
671 	if (vtoc.v_sanity != VTOC_SANE)
672 		return;
673 
674 	if (vtoc.v_version != V_VERSION && vtoc.v_version != 0)
675 		return;
676 
677 	if (dki_info.dki_partition > V_NUMPAR)
678 		return;
679 
680 	*size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size;
681 }
682 
683 
684 static int
685 iscluster(void)
686 {
687 	/*
688 	 * Find out if we are running in a cluster
689 	 */
690 	cl_nodeid = cfg_iscluster();
691 	if (cl_nodeid > 0) {
692 		return (TRUE);
693 	} else if (cl_nodeid == 0) {
694 		return (FALSE);
695 	}
696 
697 	(void) fprintf(stderr, "%s\n",
698 	    gettext("nskernd: unable to ascertain environment"));
699 	exit(1);
700 	/* NOTREACHED */
701 }
702 
703 /*
704  * Runtime Solaris release checking - build release == runtime release
705  * is always considered success, so only keep entries in the map for
706  * the special cases.
707  */
708 static nsc_release_t nskernd_rel_map[] = {
709 /*	{ "5.10", "5.10" },			*/
710 	{ "5.11", "5.10" },
711 	{ NULL, NULL }
712 };
713 
714 
715 #ifdef lint
716 #define	main	nskernd_main
717 #endif
718 /* ARGSUSED1 */
719 int
720 main(int argc, char *argv[])
721 {
722 	const char *dir = "/";
723 	struct nskernd data;
724 	struct rlimit rl;
725 	int i, run, rc;
726 	int partition;
727 	char *reqd;
728 	int syncpipe[2];
729 	int startup;
730 
731 	(void) setlocale(LC_ALL, "");
732 	(void) textdomain("nskernd");
733 
734 	rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd);
735 	if (rc < 0) {
736 		(void) fprintf(stderr,
737 		    gettext("nskernd: unable to determine the current "
738 		    "Solaris release: %s\n"), strerror(errno));
739 		exit(1);
740 	} else if (rc == FALSE) {
741 		(void) fprintf(stderr,
742 		    gettext("nskernd: incorrect Solaris release "
743 		    "(requires %s)\n"), reqd);
744 		exit(1);
745 	}
746 
747 	rc = 0;
748 
749 	if (argc != 1)
750 		usage();
751 
752 	/*
753 	 * Usage: <progname> [-g] [-d <seconds to delay>]
754 	 */
755 	while ((i = getopt(argc, argv, "gd:")) != EOF) {
756 		switch (i) {
757 			case 'g':
758 				display_msg = 1;
759 				break;
760 			case 'd':
761 				delay_time = atoi(optarg);
762 				if (delay_time <= 0) {
763 					delay_time = 30;
764 				}
765 				break;
766 			default:
767 				syslog(LOG_ERR,
768 				"Usage: nskernd [-g] [-d <seconds to delay>]");
769 				exit(1);
770 				break;
771 		}
772 	}
773 
774 	if (chroot(dir) < 0) {
775 		(void) fprintf(stderr, gettext("nskernd: chroot failed: %s\n"),
776 		    strerror(errno));
777 		exit(1);
778 	}
779 
780 	if (chdir(dir) < 0) {
781 		(void) fprintf(stderr, gettext("nskernd: chdir failed: %s\n"),
782 		    strerror(errno));
783 		exit(1);
784 	}
785 
786 	/*
787 	 * Determine if we are in a Sun Cluster or not, before fork'ing
788 	 */
789 	(void) iscluster();
790 
791 	/*
792 	 * create a pipe to synchronise the parent with the
793 	 * child just before it enters its service loop.
794 	 */
795 	if (pipe(syncpipe) < 0) {
796 		(void) fprintf(stderr,
797 		    gettext("nskernd: cannot create pipe: %s\n"),
798 		    strerror(errno));
799 		exit(1);
800 	}
801 	/*
802 	 * Fork off a child that becomes the daemon.
803 	 */
804 
805 	if ((rc = fork()) > 0) {
806 		char c;
807 		int n;
808 		(void) close(syncpipe[1]);
809 		/*
810 		 * wait for the close of the pipe.
811 		 * If we get a char back, indicates good
812 		 * status from child, so exit 0.
813 		 * If we get a zero length read, then the
814 		 * child has failed, so we do too.
815 		 */
816 		n = read(syncpipe[0], &c, 1);
817 		exit((n <= 0) ? 1 : 0);
818 	} else if (rc < 0) {
819 		(void) fprintf(stderr, gettext("nskernd: cannot fork: %s\n"),
820 		    strerror(errno));
821 		exit(1);
822 	}
823 
824 	/*
825 	 * In child - become daemon.
826 	 */
827 
828 	/* use closefrom(3C) from PSARC/2000/193 when possible */
829 	for (i = 0; i < syncpipe[1]; i++) {
830 		(void) close(i);
831 	}
832 	closefrom(syncpipe[1] + 1);
833 
834 	(void) open("/dev/console", O_WRONLY|O_APPEND);
835 	(void) dup(0);
836 	(void) dup(0);
837 	(void) close(0);
838 
839 	(void) setpgrp();
840 
841 	/*
842 	 * Ignore all signals apart from SIGTERM.
843 	 */
844 
845 	for (i = 1; i < _sys_nsig; i++)
846 		(void) sigset(i, SIG_IGN);
847 
848 	(void) sigset(SIGTERM, sighand);
849 
850 	/*
851 	 * Increase the number of fd's that can be open.
852 	 */
853 
854 	rl.rlim_cur = RLIM_INFINITY;
855 	rl.rlim_max = RLIM_INFINITY;
856 	if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
857 		(void) fprintf(stderr,
858 		    gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
859 		    strerror(errno));
860 		(void) fprintf(stderr,
861 		    gettext("nskernd: the maximum number of nsctl open "
862 		    "devices may be reduced\n"));
863 	}
864 
865 	/*
866 	 * Open /dev/nsctl and startup.
867 	 */
868 
869 	nsctl_fd = open(rdev, O_RDONLY);
870 	if (nsctl_fd < 0) {
871 		(void) fprintf(stderr, gettext("nskernd: unable to open %s\n"),
872 		    rdev);
873 		exit(1);
874 	}
875 
876 	bzero(&data, sizeof (data));
877 
878 	data.command = NSKERND_START;
879 	data.data1 = (uint64_t)cl_nodeid;
880 	run = 1;
881 
882 	startup = 1;
883 	while (run) {
884 		rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
885 		if (rc < 0) {
886 			/* try and do kernel cleanup and exit */
887 			if (shutdown()) {
888 				run = 0;
889 			} else {
890 				sigterm = 0;
891 			}
892 
893 			(void) fprintf(stderr,
894 			    gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
895 			    strerror(errno));
896 			continue;
897 		} else if (sigterm) {
898 			/* SIGTERM received - terminate */
899 			if (data.command != NSKERND_START &&
900 			    (data.command != NSKERND_STOP ||
901 			    data.data1 != (uint64_t)1)) {
902 				/* need to do kernel cleanup */
903 				if (shutdown()) {
904 					run = 0;
905 				} else {
906 					sigterm = 0;
907 					data.command = NSKERND_START;
908 					data.data1 = (uint64_t)cl_nodeid;
909 				}
910 			} else {
911 				/* just quit */
912 				if (canshutdown()) {
913 					run = 0;
914 				} else {
915 					/* cannot shutdown - threads active */
916 					sigterm = 0;
917 					data.command = NSKERND_START;
918 					data.data1 = (uint64_t)cl_nodeid;
919 				}
920 			}
921 			continue;
922 		}
923 		if (startup) {
924 			char c = 0;
925 			(void) write(syncpipe[1], &c, 1);
926 			(void) close(syncpipe[1]);
927 			startup = 0;
928 		}
929 		switch (data.command) {
930 		case NSKERND_START:	/* (re)start completion */
931 			if (rc == 1) {
932 				(void) fprintf(stderr,
933 				    gettext("nskernd: already started\n"));
934 				run = 0;
935 			} else if (rc == 2) {
936 				(void) fprintf(stderr,
937 				    gettext("nskernd: stopped by kernel\n"));
938 				run = 0;
939 			}
940 			data.command = NSKERND_WAIT;
941 			break;
942 
943 		case NSKERND_STOP:	/* kernel telling daemon to stop */
944 			if (data.data1 != (uint64_t)1) {
945 				(void) shutdown();
946 				run = 0;
947 			}
948 			break;
949 
950 		case NSKERND_BSIZE:
951 			/*
952 			 * kernel requesting partsize
953 			 * data1 - size return
954 			 * data2 - raw_fd (entry)
955 			 *	 - partition number (return)
956 			 */
957 			partition = -1;
958 			get_bsize(data.data2, &data.data1,
959 			    &partition, data.char1);
960 			data.data2 = (uint64_t)partition;
961 			data.command = NSKERND_WAIT;
962 			break;
963 
964 		case NSKERND_NEWLWP:	/* kernel requesting a new LWP */
965 			newlwp(&data);
966 			data.command = NSKERND_WAIT;
967 			break;
968 
969 		case NSKERND_LOCK:  	/* kernel requesting lock */
970 			dolock(&data);
971 			data.command = NSKERND_WAIT;
972 			break;
973 
974 		case NSKERND_WAIT:	/* kernel retrying wait */
975 			/*
976 			 * the kernel thread can be woken by the dr config
977 			 * utilities (ie cfgadm) therefore we just reissue
978 			 * the wait.
979 			 */
980 			break;
981 
982 		case NSKERND_IIBITMAP:
983 			rc = log_iibmp_err(data.char1, (int)data.data1);
984 			data.data1 = (uint64_t)rc;
985 			data.command = NSKERND_WAIT;
986 			break;
987 
988 		default:
989 			(void) fprintf(stderr,
990 				gettext("nskernd: unknown command %d"),
991 				data.command);
992 			data.command = NSKERND_WAIT;
993 			break;
994 		}
995 	}
996 
997 	(void) close(nsctl_fd);
998 
999 	return (rc);
1000 }
1001