1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
#include <sys/types.h>
#include <sys/resource.h>
#include <sys/priocntl.h>
#include <sys/rtpriocntl.h>
#include <sys/tspriocntl.h>
#include <sys/wait.h>
#include <sys/stat.h>

#include <strings.h>
#include <string.h>
#include <thread.h>
#include <stdlib.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <locale.h>
#include <unistd.h>
#include <syslog.h>

#include <sys/nsctl/cfg.h>
#include <sys/nsctl/nsctl.h>
#include <sys/nsctl/nsc_ioctl.h>
#include <sys/nskernd.h>
#include <nsctl.h>

#include <sys/mkdev.h>
#include <sys/nsctl/sv_efi.h>
54
/* Path of the nsctl control device opened by get_bsize() and main(). */
static const char *rdev = "/dev/nsctl";

/*
 * Define a minimal user stack size in bytes over and above the
 * libthread THR_MIN_STACK minimum value.
 *
 * This stack size needs to be sufficient to run _newlwp() and then
 * ioctl() down into the kernel.  The same size is used for the
 * _dolock() threads.
 */
#define	NSK_STACK_SIZE	512

/*
 * LWP scheduling control switches.
 *
 * allow_pri	- set to non-zero to enable priocntl() manipulations of
 *		created LWPs.
 * allow_rt	- set to non-zero to use the RT rather than the TS
 *		scheduling class when manipulating the scheduling
 *		parameters for an LWP.  Only used if allow_pri is
 *		non-zero.
 */
static int allow_pri = 1;
static int allow_rt = 0;	/* disallow - bad interactions with timeout() */
78
79 static int nsctl_fd = -1;
80 static int sigterm;
81
82 static int nthreads; /* number of threads in the kernel */
83 static int exiting; /* shutdown in progress flag */
84 static mutex_t thr_mutex = DEFAULTMUTEX;
85 static mutex_t cfg_mutex = DEFAULTMUTEX;
86
87 static int cl_nodeid = -1;
88
89 static int display_msg = 0;
90 static int delay_time = 30;
91
/*
 * Print the usage message on stderr and terminate the process with exit
 * status 255.  Does not return.
 */
static void
usage(void)
{
	/*
	 * The translated text comes out of a message catalog, so it is
	 * printed as data rather than being used as the format string.
	 */
	(void) fprintf(stderr, "%s", gettext("usage: nskernd\n"));
	exit(255);
}
98
99
100 static void
sighand(int sig)101 sighand(int sig)
102 {
103 if (sig == SIGTERM) {
104 sigterm++;
105 }
106 }
107
108
109 /*
110 * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel
111 */
112 int
nthread_inc(void)113 nthread_inc(void)
114 {
115 (void) mutex_lock(&thr_mutex);
116 if (exiting) {
117 /* cannot enter kernel as nskernd is being shutdown - exit */
118 (void) mutex_unlock(&thr_mutex);
119 return (0);
120 }
121 nthreads++;
122 (void) mutex_unlock(&thr_mutex);
123 return (1);
124 }
125
126
127 void
nthread_dec(void)128 nthread_dec(void)
129 {
130 (void) mutex_lock(&thr_mutex);
131 nthreads--;
132 (void) mutex_unlock(&thr_mutex);
133 }
134
135
136 /*
137 * returns: 1 - can shutdown; 0 - unable to shutdown
138 */
139 int
canshutdown(void)140 canshutdown(void)
141 {
142 int rc = 1;
143 time_t start_delay;
144
145 (void) mutex_lock(&thr_mutex);
146 if (nthreads > 0) {
147 if (display_msg) {
148 (void) fprintf(stderr,
149 gettext("nskernd: unable to shutdown: "
150 "%d kernel threads in use\n"), nthreads);
151 }
152 start_delay = time(0);
153 while (nthreads > 0 && (time(0) - start_delay) < delay_time) {
154 (void) mutex_unlock(&thr_mutex);
155 (void) sleep(1);
156 (void) mutex_lock(&thr_mutex);
157 (void) fprintf(stderr,
158 gettext("nskernd: delay shutdown: "
159 "%d kernel threads in use\n"), nthreads);
160 }
161 if (nthreads > 0) {
162 rc = 0;
163 } else {
164 exiting = 1;
165 }
166 } else {
167 /* flag shutdown in progress */
168 exiting = 1;
169 }
170 (void) mutex_unlock(&thr_mutex);
171
172 return (rc);
173 }
174
175
176 /*
177 * returns: 1 - shutdown successful; 0 - unable to shutdown
178 */
179 int
shutdown(void)180 shutdown(void)
181 {
182 struct nskernd data;
183 int rc;
184
185 if (nsctl_fd < 0)
186 return (1);
187
188 bzero(&data, sizeof (data));
189 data.command = NSKERND_STOP;
190
191 if (!canshutdown()) {
192 return (0);
193 }
194
195 rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
196 if (rc < 0) {
197 if (errno != EINTR || !sigterm) {
198 (void) fprintf(stderr,
199 gettext("nskernd: NSKERND_STOP failed\n"));
200 }
201 }
202
203 return (1);
204 }
205
206
207 /*
208 * First function run by a NSKERND_NEWLWP thread.
209 *
210 * Determines if it needs to change the scheduling priority of the LWP,
211 * and then calls back into the kernel.
212 */
213 static void *
_newlwp(void * arg)214 _newlwp(void *arg)
215 {
216 struct nskernd nsk;
217 pcparms_t pcparms;
218 pcinfo_t pcinfo;
219
220 /* copy arguments onto stack and free heap memory */
221 bcopy(arg, &nsk, sizeof (nsk));
222 free(arg);
223
224 if (nsk.data2 && allow_pri) {
225 /* increase the scheduling priority of this LWP */
226
227 bzero(&pcinfo, sizeof (pcinfo));
228 (void) strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS");
229
230 if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) {
231 (void) fprintf(stderr,
232 gettext(
233 "nskernd: priocntl(PC_GETCID) failed: %s\n"),
234 strerror(errno));
235 goto pri_done;
236 }
237
238 bzero(&pcparms, sizeof (pcparms));
239 pcparms.pc_cid = pcinfo.pc_cid;
240
241 if (allow_rt) {
242 ((rtparms_t *)pcparms.pc_clparms)->rt_pri =
243 (pri_t)0; /* minimum RT priority */
244 ((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs =
245 (uint_t)RT_TQDEF;
246 ((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs =
247 RT_TQDEF;
248 } else {
249 ((tsparms_t *)pcparms.pc_clparms)->ts_uprilim =
250 ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
251 ((tsparms_t *)pcparms.pc_clparms)->ts_upri =
252 ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
253 }
254
255 if (priocntl(P_LWPID, P_MYID,
256 PC_SETPARMS, (char *)&pcparms) < 0) {
257 (void) fprintf(stderr,
258 gettext(
259 "nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
260 strerror(errno));
261 }
262 }
263
264 pri_done:
265 if (nthread_inc()) {
266 (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
267 nthread_dec();
268 }
269 return (NULL);
270 }
271
272
273 /*
274 * Start a new thread bound to an LWP.
275 *
276 * This is the user level side of nsc_create_process().
277 */
278 static void
newlwp(struct nskernd * req)279 newlwp(struct nskernd *req)
280 {
281 struct nskernd *nskp;
282 thread_t tid;
283 int rc;
284
285 nskp = malloc(sizeof (*nskp));
286 if (!nskp) {
287 #ifdef DEBUG
288 (void) fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"),
289 sizeof (*nskp));
290 #endif
291 req->data1 = (uint64_t)ENOMEM;
292 return;
293 }
294
295 /* copy args for child */
296 bcopy(req, nskp, sizeof (*nskp));
297
298 rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
299 _newlwp, nskp, THR_BOUND|THR_DETACHED, &tid);
300
301 if (rc != 0) {
302 /* thr_create failed */
303 #ifdef DEBUG
304 (void) fprintf(stderr,
305 gettext("nskernd: thr_create failed: %s\n"),
306 strerror(errno));
307 #endif
308 req->data1 = (uint64_t)errno;
309 free(nskp);
310 } else {
311 /* success - _newlwp() will free nskp */
312 req->data1 = (uint64_t)0;
313 }
314 }
315
316 static int
log_iibmp_err(char * set,int flags)317 log_iibmp_err(char *set, int flags)
318 {
319 CFGFILE *cfg;
320 char key[CFG_MAX_KEY];
321 char buf[CFG_MAX_BUF];
322 char newflags[CFG_MAX_BUF];
323 char outbuf[CFG_MAX_BUF];
324 char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp;
325 int setno, found = 0;
326 int setlen;
327 int rc = 0;
328 pid_t pid = -1;
329
330 if (set && *set) {
331 setlen = strlen(set);
332 } else {
333 return (EINVAL);
334 }
335
336 (void) mutex_lock(&cfg_mutex);
337 cfg = cfg_open("");
338 if (!cfg) {
339 (void) mutex_unlock(&cfg_mutex);
340 return (ENXIO);
341 }
342
343 if (!cfg_lock(cfg, CFG_WRLOCK)) {
344
345 (void) mutex_unlock(&cfg_mutex);
346 cfg_close(cfg);
347
348 pid = fork();
349
350 if (pid == -1) {
351 (void) fprintf(stderr, gettext(
352 "nskernd: Error forking\n"));
353 return (errno);
354 } else if (pid > 0) {
355 (void) fprintf(stdout, gettext(
356 "nskernd: Attempting deferred bitmap error\n"));
357 return (0);
358 }
359
360 (void) mutex_lock(&cfg_mutex);
361 cfg = cfg_open("");
362 if (!cfg) {
363 (void) mutex_unlock(&cfg_mutex);
364 (void) fprintf(stderr, gettext(
365 "nskernd: Failed cfg_open, deferred bitmap\n"));
366 return (ENXIO);
367 }
368
369 /* Sooner or later, this lock will be free */
370 while (!cfg_lock(cfg, CFG_WRLOCK))
371 (void) sleep(2);
372 }
373
374 /* find the proper set number */
375 for (setno = 1; !found; setno++) {
376 (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
377 if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) {
378 break;
379 }
380
381 mst = strtok(buf, " ");
382 shd = strtok(NULL, " ");
383 if (strncmp(shd, set, setlen) == 0) {
384 found = 1;
385
386 bmp = strtok(NULL, " ");
387 mode = strtok(NULL, " ");
388 ovr = strtok(NULL, " ");
389 cnode = strtok(NULL, " ");
390 opt = strtok(NULL, " ");
391 grp = strtok(NULL, " ");
392 break;
393 }
394 }
395
396 if (found) {
397 /* were there flags in the options field already? */
398 (void) snprintf(newflags, CFG_MAX_BUF, "%s=0x%x",
399 NSKERN_II_BMP_OPTION, flags);
400 if (opt && strcmp(opt, "-") != 0) {
401 bzero(newflags, CFG_MAX_BUF);
402 opt = strtok(opt, ";");
403 while (opt) {
404 if (strncmp(opt, NSKERN_II_BMP_OPTION,
405 strlen(NSKERN_II_BMP_OPTION)) != 0) {
406 (void) strcat(newflags, ";");
407 (void) strcat(newflags, opt);
408 }
409 }
410 }
411 (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
412 (void) snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s",
413 mst, shd, bmp, mode, ovr, cnode, newflags, grp);
414 if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) {
415 (void) printf("Failed to put [%s]\n", outbuf);
416 rc = ENXIO;
417 } else {
418 (void) cfg_commit(cfg);
419 rc = 0;
420 }
421 } else {
422 (void) fprintf(stderr, gettext(
423 "nskernd: Failed deferred bitmap [%s]\n"), set);
424 rc = EINVAL;
425 }
426 cfg_unlock(cfg);
427 cfg_close(cfg);
428 (void) mutex_unlock(&cfg_mutex);
429
430 /*
431 * if we are the fork'ed client, just exit, if parent just return
432 */
433 if (pid == 0) {
434 exit(rc);
435 /*NOTREACHED*/
436 } else {
437 return (rc);
438 }
439 }
440
441 /*
442 * First function run by a NSKERND_LOCK thread.
443 *
444 * Opens dscfg and locks it,
445 * and then calls back into the kernel.
446 *
447 * Incoming:
448 * data1 is the kernel address of the sync structure.
449 * data2 is read(0)/write(1) lock mode.
450 *
451 * Returns:
452 * data1 as incoming.
453 * data2 errno.
454 */
455 static void *
_dolock(void * arg)456 _dolock(void *arg)
457 {
458 struct nskernd nsk;
459 CFGFILE *cfg;
460 int locked;
461 int mode;
462 int rc = 0;
463
464 /* copy arguments onto stack and free heap memory */
465 bcopy(arg, &nsk, sizeof (nsk));
466 free(arg);
467
468 (void) mutex_lock(&cfg_mutex);
469 cfg = cfg_open("");
470 if (cfg == NULL) {
471 #ifdef DEBUG
472 (void) fprintf(stderr,
473 gettext("nskernd: cfg_open failed: %s\n"),
474 strerror(errno));
475 #endif
476 rc = ENXIO;
477 }
478
479 if (nsk.data2 == 0) {
480 mode = CFG_RDLOCK;
481 } else {
482 mode = CFG_WRLOCK;
483 }
484
485 locked = 0;
486 if (rc == 0) {
487 if (cfg_lock(cfg, mode)) {
488 locked = 1;
489 } else {
490 #ifdef DEBUG
491 (void) fprintf(stderr,
492 gettext("nskernd: cfg_lock failed: %s\n"),
493 strerror(errno));
494 #endif
495 rc = EINVAL;
496 }
497 }
498
499 /* return to kernel */
500
501 nsk.data2 = (uint64_t)rc;
502 if (nthread_inc()) {
503 (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
504 nthread_dec();
505 }
506
507 /* cleanup */
508
509 if (locked) {
510 cfg_unlock(cfg);
511 locked = 0;
512 }
513
514 if (cfg != NULL) {
515 cfg_close(cfg);
516 cfg = NULL;
517 }
518 (void) mutex_unlock(&cfg_mutex);
519
520 return (NULL);
521 }
522
523
524 /*
525 * Inter-node lock thread.
526 *
527 * This is the user level side of nsc_rmlock().
528 */
529 static void
dolock(struct nskernd * req)530 dolock(struct nskernd *req)
531 {
532 struct nskernd *nskp;
533 thread_t tid;
534 int rc;
535
536 /* create a new thread to do the lock and return to kernel */
537
538 nskp = malloc(sizeof (*nskp));
539 if (!nskp) {
540 #ifdef DEBUG
541 (void) fprintf(stderr,
542 gettext("nskernd:dolock: malloc(%d) failed\n"),
543 sizeof (*nskp));
544 #endif
545 req->data1 = (uint64_t)ENOMEM;
546 return;
547 }
548
549 /* copy args for child */
550 bcopy(req, nskp, sizeof (*nskp));
551
552 rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
553 _dolock, nskp, THR_BOUND|THR_DETACHED, &tid);
554
555 if (rc != 0) {
556 /* thr_create failed */
557 #ifdef DEBUG
558 (void) fprintf(stderr,
559 gettext("nskernd: thr_create failed: %s\n"),
560 strerror(errno));
561 #endif
562 req->data1 = (uint64_t)errno;
563 free(nskp);
564 } else {
565 /* success - _dolock() will free nskp */
566 req->data1 = (uint64_t)0;
567 }
568 }
569
570
571 /*
572 * Convenience code for engineering test of multi-terabyte volumes.
573 *
574 * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
575 * labels. This code allocates a simple efi label structure and ioctls
576 * to extract the size of a zvol. It only handles the minimal EFI ioctl
577 * implementation in zvol.
578 */
579
580 static void
zvol_bsize(char * path,uint64_t * size,const int pnum)581 zvol_bsize(char *path, uint64_t *size, const int pnum)
582 {
583 struct stat64 stb1, stb2;
584 struct dk_minfo dkm;
585 int fd = -1;
586 int rc;
587
588 if (cl_nodeid || pnum != 0)
589 return;
590
591 if ((fd = open(path, O_RDONLY)) < 0) {
592 return;
593 }
594
595 if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 ||
596 fstat64(fd, &stb2) != 0 ||
597 !S_ISCHR(stb1.st_mode) ||
598 !S_ISCHR(stb2.st_mode) ||
599 major(stb1.st_rdev) != major(stb2.st_rdev)) {
600 (void) close(fd);
601 return;
602 }
603
604 rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm);
605 if (rc >= 0) {
606 *size = LE_64(dkm.dki_capacity) *
607 (dkm.dki_lbsize) / 512;
608 }
609
610 (void) close(fd);
611 }
612
613 /* ARGSUSED */
614 static void
get_bsize(uint64_t raw_fd,uint64_t * size,int * partitionp,char * path)615 get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path)
616 {
617 struct nscioc_bsize bsize;
618 #ifdef DKIOCPARTITION
619 struct partition64 p64;
620 #endif
621 struct dk_cinfo dki_info;
622 struct vtoc vtoc;
623 int fd;
624
625 *partitionp = -1;
626 *size = (uint64_t)0;
627
628 dki_info.dki_partition = (ushort_t)-1;
629 bsize.dki_info = (uint64_t)(unsigned long)&dki_info;
630 bsize.vtoc = (uint64_t)(unsigned long)&vtoc;
631 bsize.raw_fd = raw_fd;
632 bsize.efi = 0;
633
634 fd = open(rdev, O_RDONLY);
635 if (fd < 0)
636 return;
637
638 if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
639 if (dki_info.dki_partition != (ushort_t)-1) {
640 /* assume part# is ok and just the size failed */
641 *partitionp = (int)dki_info.dki_partition;
642
643 #ifdef DKIOCPARTITION
644 /* see if this is an EFI label */
645 bzero(&p64, sizeof (p64));
646 p64.p_partno = (uint_t)*partitionp;
647 if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) {
648 *size = (uint64_t)p64.p_size;
649 } else {
650 bsize.p64 = (uint64_t)(unsigned long)&p64;
651 bsize.efi = 1;
652
653 if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
654 /* see if this is a zvol */
655 zvol_bsize(path, size, *partitionp);
656 } else {
657 *size = (uint64_t)p64.p_size;
658 }
659 }
660 #endif /* DKIOCPARTITION */
661 }
662
663 (void) close(fd);
664 return;
665 }
666
667 (void) close(fd);
668
669 *partitionp = (int)dki_info.dki_partition;
670
671 if (vtoc.v_sanity != VTOC_SANE)
672 return;
673
674 if (vtoc.v_version != V_VERSION && vtoc.v_version != 0)
675 return;
676
677 if (dki_info.dki_partition > V_NUMPAR)
678 return;
679
680 *size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size;
681 }
682
683
684 static int
iscluster(void)685 iscluster(void)
686 {
687 /*
688 * Find out if we are running in a cluster
689 */
690 cl_nodeid = cfg_iscluster();
691 if (cl_nodeid > 0) {
692 return (TRUE);
693 } else if (cl_nodeid == 0) {
694 return (FALSE);
695 }
696
697 (void) fprintf(stderr, "%s\n",
698 gettext("nskernd: unable to ascertain environment"));
699 exit(1);
700 /* NOTREACHED */
701 }
702
/*
 * Runtime Solaris release checking - build release == runtime release
 * is always considered success, so only keep entries in the map for
 * the special cases.
 *
 * main() hands this table to nsc_check_release() together with
 * BUILD_REV_STR.  The table ends with a { NULL, NULL } entry.
 */
static nsc_release_t nskernd_rel_map[] = {
/*	{ "5.10", "5.10" }, */
	{ "5.11", "5.10" },
	{ NULL, NULL }
};
713
714
715 #ifdef lint
716 #define main nskernd_main
717 #endif
718 /* ARGSUSED1 */
719 int
main(int argc,char * argv[])720 main(int argc, char *argv[])
721 {
722 const char *dir = "/";
723 struct nskernd data;
724 struct rlimit rl;
725 int i, run, rc;
726 int partition;
727 char *reqd;
728 int syncpipe[2];
729 int startup;
730
731 (void) setlocale(LC_ALL, "");
732 (void) textdomain("nskernd");
733
734 rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd);
735 if (rc < 0) {
736 (void) fprintf(stderr,
737 gettext("nskernd: unable to determine the current "
738 "Solaris release: %s\n"), strerror(errno));
739 exit(1);
740 } else if (rc == FALSE) {
741 (void) fprintf(stderr,
742 gettext("nskernd: incorrect Solaris release "
743 "(requires %s)\n"), reqd);
744 exit(1);
745 }
746
747 rc = 0;
748
749 if (argc != 1)
750 usage();
751
752 /*
753 * Usage: <progname> [-g] [-d <seconds to delay>]
754 */
755 while ((i = getopt(argc, argv, "gd:")) != EOF) {
756 switch (i) {
757 case 'g':
758 display_msg = 1;
759 break;
760 case 'd':
761 delay_time = atoi(optarg);
762 if (delay_time <= 0) {
763 delay_time = 30;
764 }
765 break;
766 default:
767 syslog(LOG_ERR,
768 "Usage: nskernd [-g] [-d <seconds to delay>]");
769 exit(1);
770 break;
771 }
772 }
773
774 if (chroot(dir) < 0) {
775 (void) fprintf(stderr, gettext("nskernd: chroot failed: %s\n"),
776 strerror(errno));
777 exit(1);
778 }
779
780 if (chdir(dir) < 0) {
781 (void) fprintf(stderr, gettext("nskernd: chdir failed: %s\n"),
782 strerror(errno));
783 exit(1);
784 }
785
786 /*
787 * Determine if we are in a Sun Cluster or not, before fork'ing
788 */
789 (void) iscluster();
790
791 /*
792 * create a pipe to synchronise the parent with the
793 * child just before it enters its service loop.
794 */
795 if (pipe(syncpipe) < 0) {
796 (void) fprintf(stderr,
797 gettext("nskernd: cannot create pipe: %s\n"),
798 strerror(errno));
799 exit(1);
800 }
801 /*
802 * Fork off a child that becomes the daemon.
803 */
804
805 if ((rc = fork()) > 0) {
806 char c;
807 int n;
808 (void) close(syncpipe[1]);
809 /*
810 * wait for the close of the pipe.
811 * If we get a char back, indicates good
812 * status from child, so exit 0.
813 * If we get a zero length read, then the
814 * child has failed, so we do too.
815 */
816 n = read(syncpipe[0], &c, 1);
817 exit((n <= 0) ? 1 : 0);
818 } else if (rc < 0) {
819 (void) fprintf(stderr, gettext("nskernd: cannot fork: %s\n"),
820 strerror(errno));
821 exit(1);
822 }
823
824 /*
825 * In child - become daemon.
826 */
827
828 /* use closefrom(3C) from PSARC/2000/193 when possible */
829 for (i = 0; i < syncpipe[1]; i++) {
830 (void) close(i);
831 }
832 closefrom(syncpipe[1] + 1);
833
834 (void) open("/dev/console", O_WRONLY|O_APPEND);
835 (void) dup(0);
836 (void) dup(0);
837 (void) close(0);
838
839 (void) setpgrp();
840
841 /*
842 * Ignore all signals apart from SIGTERM.
843 */
844
845 for (i = 1; i < _sys_nsig; i++)
846 (void) sigset(i, SIG_IGN);
847
848 (void) sigset(SIGTERM, sighand);
849
850 /*
851 * Increase the number of fd's that can be open.
852 */
853
854 rl.rlim_cur = RLIM_INFINITY;
855 rl.rlim_max = RLIM_INFINITY;
856 if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
857 (void) fprintf(stderr,
858 gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
859 strerror(errno));
860 (void) fprintf(stderr,
861 gettext("nskernd: the maximum number of nsctl open "
862 "devices may be reduced\n"));
863 }
864
865 /*
866 * Open /dev/nsctl and startup.
867 */
868
869 nsctl_fd = open(rdev, O_RDONLY);
870 if (nsctl_fd < 0) {
871 (void) fprintf(stderr, gettext("nskernd: unable to open %s\n"),
872 rdev);
873 exit(1);
874 }
875
876 bzero(&data, sizeof (data));
877
878 data.command = NSKERND_START;
879 data.data1 = (uint64_t)cl_nodeid;
880 run = 1;
881
882 startup = 1;
883 while (run) {
884 rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
885 if (rc < 0) {
886 /* try and do kernel cleanup and exit */
887 if (shutdown()) {
888 run = 0;
889 } else {
890 sigterm = 0;
891 }
892
893 (void) fprintf(stderr,
894 gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
895 strerror(errno));
896 continue;
897 } else if (sigterm) {
898 /* SIGTERM received - terminate */
899 if (data.command != NSKERND_START &&
900 (data.command != NSKERND_STOP ||
901 data.data1 != (uint64_t)1)) {
902 /* need to do kernel cleanup */
903 if (shutdown()) {
904 run = 0;
905 } else {
906 sigterm = 0;
907 data.command = NSKERND_START;
908 data.data1 = (uint64_t)cl_nodeid;
909 }
910 } else {
911 /* just quit */
912 if (canshutdown()) {
913 run = 0;
914 } else {
915 /* cannot shutdown - threads active */
916 sigterm = 0;
917 data.command = NSKERND_START;
918 data.data1 = (uint64_t)cl_nodeid;
919 }
920 }
921 continue;
922 }
923 if (startup) {
924 char c = 0;
925 (void) write(syncpipe[1], &c, 1);
926 (void) close(syncpipe[1]);
927 startup = 0;
928 }
929 switch (data.command) {
930 case NSKERND_START: /* (re)start completion */
931 if (rc == 1) {
932 (void) fprintf(stderr,
933 gettext("nskernd: already started\n"));
934 run = 0;
935 } else if (rc == 2) {
936 (void) fprintf(stderr,
937 gettext("nskernd: stopped by kernel\n"));
938 run = 0;
939 }
940 data.command = NSKERND_WAIT;
941 break;
942
943 case NSKERND_STOP: /* kernel telling daemon to stop */
944 if (data.data1 != (uint64_t)1) {
945 (void) shutdown();
946 run = 0;
947 }
948 break;
949
950 case NSKERND_BSIZE:
951 /*
952 * kernel requesting partsize
953 * data1 - size return
954 * data2 - raw_fd (entry)
955 * - partition number (return)
956 */
957 partition = -1;
958 get_bsize(data.data2, &data.data1,
959 &partition, data.char1);
960 data.data2 = (uint64_t)partition;
961 data.command = NSKERND_WAIT;
962 break;
963
964 case NSKERND_NEWLWP: /* kernel requesting a new LWP */
965 newlwp(&data);
966 data.command = NSKERND_WAIT;
967 break;
968
969 case NSKERND_LOCK: /* kernel requesting lock */
970 dolock(&data);
971 data.command = NSKERND_WAIT;
972 break;
973
974 case NSKERND_WAIT: /* kernel retrying wait */
975 /*
976 * the kernel thread can be woken by the dr config
977 * utilities (ie cfgadm) therefore we just reissue
978 * the wait.
979 */
980 break;
981
982 case NSKERND_IIBITMAP:
983 rc = log_iibmp_err(data.char1, (int)data.data1);
984 data.data1 = (uint64_t)rc;
985 data.command = NSKERND_WAIT;
986 break;
987
988 default:
989 (void) fprintf(stderr,
990 gettext("nskernd: unknown command %d"),
991 data.command);
992 data.command = NSKERND_WAIT;
993 break;
994 }
995 }
996
997 (void) close(nsctl_fd);
998
999 return (rc);
1000 }
1001